def main(filename, outfilename, count, stop_after):
    """Write a random sample of filtered tweets from one gzip file to another.

    Reads ``filename`` (gzip, one JSON document per line), keeps only the
    tweets whose ``'text'`` passes ``LanguageChecker('italian').is_valid``
    and for which ``HashtagExtractor().extract`` returns a truthy value, and
    maintains a reservoir of at most ``count`` of them. The reservoir is then
    written to ``outfilename`` (gzip, one JSON document per line).

    Args:
        filename: path of the gzip-compressed input file.
        outfilename: path of the gzip-compressed output file to create.
        count: maximum number of tweets to keep in the sample.
        stop_after: input line index from which reading may stop, provided
            the sample already holds ``count`` tweets. The check only runs
            on lines that passed both filters.
    """
    sample = []
    checker = LanguageChecker('italian')
    hashtag = HashtagExtractor()

    # Number of tweets that passed both filters so far. The reservoir must be
    # driven by this count rather than by the raw line index, otherwise
    # filtered-out lines inflate the range and later tweets are under-sampled.
    accepted = 0

    print("Extracting a sample of %d tweets. Stopping after %d tweets" % (
        count, stop_after))

    with gzip.open(filename, 'r') as infile:
        for idx, line in enumerate(infile):
            jobj = json.loads(line)

            # Check that the tweet is actually italian
            if not checker.is_valid(jobj['text']):
                continue

            if not hashtag.extract(jobj):
                continue

            if len(sample) < count:
                # Reservoir not full yet: keep every accepted tweet.
                sample.append(jobj)
            else:
                # Reservoir full: the current tweet is accepted item number
                # `accepted` (0-based), so it replaces a random slot with
                # probability count / (accepted + 1). randint is inclusive
                # on both ends.
                r = random.randint(0, accepted)
                if r < count:
                    sample[r] = jobj

            accepted += 1

            # `idx` counts every input line, including rejected ones.
            if idx >= stop_after and len(sample) >= count:
                break

    with gzip.open(outfilename, 'w') as output:
        for jobj in sample:
            output.write(json.dumps(jobj) + "\n")
def __init__(self, input_file, output_file, rho_log, ht_log):
    """Set up helpers, counters and gzip log files for a processing run.

    Args:
        input_file: stored unchanged on ``self.input_file``.
        output_file: stored unchanged on ``self.output_file``.
        rho_log: path opened for writing with ``gzip.open``; the handle is
            kept on ``self.rho_log``.
        ht_log: path opened for writing with ``gzip.open``; the handle is
            kept on ``self.ht_log``.
    """
    # Remember where to read from and write to.
    self.input_file = input_file
    self.output_file = output_file

    # Helper objects, created in the same order as before.
    self.lang = LanguageChecker('italian')
    self.hashtag = HashtagExtractor()
    self.annotator = AnnotationExtractor()

    # Every counter starts at zero.
    self.italian = self.annotated = self.total = 0
    self.requests = self.coht = self.rho_warn = 0

    # Both log handles are opened here and are not closed in this method.
    self.rho_log = gzip.open(rho_log, 'w')
    self.ht_log = gzip.open(ht_log, 'w')
def __init__(self):
    """Create the annotation and hashtag extractor helpers."""
    annotation_extractor = AnnotationExtractor()
    hashtag_extractor = HashtagExtractor()

    self.extractor = annotation_extractor
    # NOTE(review): "hastag" looks like a misspelling of "hashtag"; the
    # attribute name is kept because code outside this block may read it.
    self.hastag = hashtag_extractor