def __init__(self, input_file, output_file, rho_log, ht_log):
    """Prepare language/hashtag/annotation helpers, counters, and logs.

    :param input_file: path of the gzip'd tweet stream to read
    :param output_file: path of the gzip'd annotation output
    :param rho_log: path for the gzip'd dubious-rho side log
    :param ht_log: path for the gzip'd co-occurring-hashtag side log
    """
    # Text-processing collaborators.
    self.lang = LanguageChecker('italian')
    self.hashtag = HashtagExtractor()
    self.annotator = AnnotationExtractor()
    # Statistics counters, all starting from zero.
    self.italian = self.annotated = self.total = 0
    self.requests = self.coht = self.rho_warn = 0
    # Side logs are kept open for the object's lifetime.
    self.rho_log = gzip.open(rho_log, 'w')
    self.ht_log = gzip.open(ht_log, 'w')
    self.input_file = input_file
    self.output_file = output_file
def __init__(self):
    """Create the extraction helpers this renderer relies on."""
    self.extractor = AnnotationExtractor()
    # NOTE(review): 'hastag' (sic) is the attribute name other methods
    # reference, so the typo is kept intentionally.
    self.hastag = HashtagExtractor()
class HTMLRenderer(object):
    """Render annotated tweets as an HTML page with (possibly nested) <a> spans."""

    def __init__(self):
        self.extractor = AnnotationExtractor()
        self.hastag = HashtagExtractor()

    def run(self, inputfile, outputfile, epsilon):
        """Read gzip'd JSON tweets and write one HTML document.

        :param inputfile: gzip file, one JSON tweet per line
        :param outputfile: destination HTML file
        :param epsilon: threshold forwarded to the annotation extractor
        """
        with gzip.open(inputfile, 'r') as input:
            with open(outputfile, 'w') as output:
                output.write(HTML_PAGE)
                for line in input:
                    tweet = loads(line)
                    text = self.hastag.sanitize(tweet['text'])
                    annotations = self.extractor.annotate(
                        text, is_tweet=False, raw=True, epsilon=epsilon)
                    output.write("<div class='tweet'>")
                    output.write("<div class='text'>")
                    output.write(self.render_single(text, annotations))
                    output.write("</div>\n")
                    output.write("</div>\n")
                output.write("</body>")

    def render_single(self, text, annotations):
        """Return *text* with each annotation wrapped in an <a> element.

        Annotations overlapping a previous one are emitted as nested
        anchors.  The result is ASCII bytes with non-ASCII characters
        XML-escaped.

        :param text: the sanitized tweet text
        :param annotations: dicts with 'start', 'end', 'spot', 'title', 'rho'
        """
        html = ""
        current = 0
        prev_pos = 0
        pending = []  # opened anchors waiting for their closing </a>

        annotations.sort(key=lambda x: x['start'])

        while current < len(annotations):
            annotation = annotations[current]
            rho = annotation['rho']
            start, stop = annotation['start'], annotation['end']
            spot, title = annotation['spot'], annotation['title']

            # Close every pending anchor that ends before this one starts.
            # BUG FIX: copy from prev_pos onward, not from the beginning of
            # text (the original text[:last['end']] duplicated the prefix).
            while pending and start >= pending[0]['end']:
                last = pending.pop(0)
                html += text[prev_pos:last['end']]
                html += '</a>'
                prev_pos = last['end']

            html += text[prev_pos:start]

            # Check whether the next annotation is nested inside this one.
            if current + 1 < len(annotations) and \
                    annotations[current + 1]['start'] < stop:
                next_nested = True
                pending.append(annotation)
                pending.sort(key=lambda x: x['end'])
                next_pos = annotations[current + 1]['start']
            else:
                next_nested = False
                next_pos = stop

            html += "<a href='#' data-spot='%s' data-title='%s' " \
                    "data-rho='%s'>%s" % (spot, title, rho,
                                          text[start:next_pos])

            if not next_nested:
                html += '</a>'

            prev_pos = next_pos
            current += 1

        # Drain any anchors still open after the last annotation.
        # BUG FIX: same prev_pos slice fix; every surviving pending anchor
        # must be closed, so the stale-'start' guard is gone.
        while pending:
            last = pending.pop(0)
            html += text[prev_pos:last['end']]
            html += '</a>'
            prev_pos = last['end']

        html += text[prev_pos:]
        return html.encode('ascii', 'xmlcharrefreplace')
class Annotator(object):
    """Filter a gzip'd tweet stream down to Italian tweets carrying
    hashtags, annotate them, and keep running statistics."""

    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Set up helpers, counters, and gzip'd side logs."""
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()
        # Counters reported by the progress line in run().
        self.italian = 0
        self.annotated = 0
        self.total = 0
        self.requests = 0
        self.coht = 0
        self.rho_warn = 0
        # Side logs: dubious-rho payloads and co-occurring hashtags.
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')
        self.input_file = input_file
        self.output_file = output_file

    def run(self, skip_lang=False):
        """Stream tweets from input_file and write annotations to output_file.

        :param skip_lang: when True, do not filter by language
        """
        with gzip.open(self.output_file, 'w') as output:
            with gzip.open(self.input_file, 'r') as f:
                for line in f:
                    # BUG FIX: this local was named 'json', shadowing the
                    # json module that annotate() depends on.
                    data = loads(line)
                    unstripped = data['text']
                    tweet_id = data['id_str']
                    text = self.hashtag.sanitize(unstripped)

                    # Skip non italian tweets
                    self.total += 1
                    if not skip_lang and not self.lang.is_valid(text):
                        continue
                    self.italian += 1

                    hts = self.hashtag.extract(data)
                    # Skip text without hashtags
                    if not hts:
                        continue

                    buff = self.annotate(tweet_id, unstripped, text, hts)
                    if buff:
                        output.write(buff)

                    if self.annotated % 1000 == 0:
                        sys.stderr.write(
                            "%d annotated of %d requested of %d italians of %d processed [%d warning, %d co-ht]\r"
                            % (self.annotated, self.requests, self.italian,
                               self.total, self.rho_warn, self.coht))
                        sys.stderr.flush()

        sys.stderr.write(
            "%d annotated of %d requested of %d italians of %d processed [%d warning, %d co-ht]\n"
            % (self.annotated, self.requests, self.italian,
               self.total, self.rho_warn, self.coht))
        sys.stderr.flush()

    def annotate(self, tweet_id, unstripped, text, hts):
        """Annotate one tweet and return the serialized payload, or "".

        Side effects: appends to rho_log when any annotation carries a rho
        of exactly 0.5, and to ht_log when the tweet has >= 2 hashtags.
        """
        self.requests += 1
        annotations = self.annotator.annotate(text)
        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text
        }
        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        for annotation in annotations:
            # NOTE(review): exact float comparison — 0.5 is presumably a
            # sentinel emitted by the extractor; confirm against its docs.
            if annotation[1] == 0.5:
                self.rho_log.write(buff)
                self.rho_warn += 1
                break

        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff
class HTMLRenderer(object):
    """Render annotated tweets as an HTML page; overlapping annotations
    become nested <a> elements."""

    def __init__(self):
        self.extractor = AnnotationExtractor()
        self.hastag = HashtagExtractor()

    def run(self, inputfile, outputfile, epsilon):
        """Read gzip'd JSON tweets from *inputfile*, write HTML to *outputfile*.

        :param epsilon: threshold forwarded to the annotation extractor
        """
        with gzip.open(inputfile, "r") as input:
            with open(outputfile, "w") as output:
                output.write(HTML_PAGE)
                for line in input:
                    tweet = loads(line)
                    text = self.hastag.sanitize(tweet["text"])
                    annotations = self.extractor.annotate(
                        text, is_tweet=False, raw=True, epsilon=epsilon
                    )
                    output.write("<div class='tweet'>")
                    output.write("<div class='text'>")
                    output.write(self.render_single(text, annotations))
                    output.write("</div>\n")
                    output.write("</div>\n")
                output.write("</body>")

    def render_single(self, text, annotations):
        """Wrap each annotation of *text* in an <a> element and return the
        result as ASCII bytes (non-ASCII XML-escaped).

        :param annotations: dicts with 'start', 'end', 'spot', 'title', 'rho'
        """
        html = ""
        current = 0
        prev_pos = 0
        pending = []  # anchors opened but not yet closed, sorted by 'end'

        annotations.sort(key=lambda x: x["start"])

        while current < len(annotations):
            annotation = annotations[current]
            rho = annotation["rho"]
            start, stop = annotation["start"], annotation["end"]
            spot, title = annotation["spot"], annotation["title"]

            # Close pending anchors that end before this annotation starts.
            # BUG FIX: slice from prev_pos, not from the start of text
            # (text[: last["end"]] re-emitted the whole prefix).
            while pending and start >= pending[0]["end"]:
                last = pending.pop(0)
                html += text[prev_pos : last["end"]]
                html += "</a>"
                prev_pos = last["end"]

            html += text[prev_pos:start]

            # Decide whether the next annotation is nested inside this one.
            if current + 1 < len(annotations) and annotations[current + 1]["start"] < stop:
                next_nested = True
                pending.append(annotation)
                pending.sort(key=lambda x: x["end"])
                next_pos = annotations[current + 1]["start"]
            else:
                next_nested = False
                next_pos = stop

            html += "<a href='#' data-spot='%s' data-title='%s' data-rho='%s'>%s" % (
                spot,
                title,
                rho,
                text[start:next_pos],
            )

            if not next_nested:
                html += "</a>"

            prev_pos = next_pos
            current += 1

        # Drain anchors still open after the last annotation.
        # BUG FIX: every surviving anchor must be closed (the old stale
        # 'start' guard could leave anchors unclosed) and slices start
        # at prev_pos.
        while pending:
            last = pending.pop(0)
            html += text[prev_pos : last["end"]]
            html += "</a>"
            prev_pos = last["end"]

        html += text[prev_pos:]
        return html.encode("ascii", "xmlcharrefreplace")
class Annotator(object):
    """Annotate Italian, hashtag-bearing tweets from a gzip'd stream,
    tracking progress statistics along the way."""

    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Create helpers, zeroed counters, and gzip'd side logs."""
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()
        # Progress counters shown by run().
        self.italian = 0
        self.annotated = 0
        self.total = 0
        self.requests = 0
        self.coht = 0
        self.rho_warn = 0
        # Side logs for dubious-rho payloads and co-occurring hashtags.
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')
        self.input_file = input_file
        self.output_file = output_file

    def run(self, skip_lang=False):
        """Process every tweet in input_file; write annotations to output_file.

        :param skip_lang: when True, skip the Italian-language filter
        """
        with gzip.open(self.output_file, 'w') as output:
            with gzip.open(self.input_file, 'r') as f:
                for line in f:
                    # BUG FIX: the local was previously called 'json',
                    # shadowing the json module used by annotate().
                    record = loads(line)
                    unstripped = record['text']
                    tweet_id = record['id_str']
                    text = self.hashtag.sanitize(unstripped)

                    # Skip non italian tweets
                    self.total += 1
                    if not skip_lang and not self.lang.is_valid(text):
                        continue
                    self.italian += 1

                    hts = self.hashtag.extract(record)
                    # Skip text without hashtags
                    if not hts:
                        continue

                    buff = self.annotate(tweet_id, unstripped, text, hts)
                    if buff:
                        output.write(buff)

                    if self.annotated % 1000 == 0:
                        sys.stderr.write(
                            "%d annotated of %d requested of %d italians of %d processed [%d warning, %d co-ht]\r"
                            % (self.annotated, self.requests, self.italian,
                               self.total, self.rho_warn, self.coht))
                        sys.stderr.flush()

        sys.stderr.write(
            "%d annotated of %d requested of %d italians of %d processed [%d warning, %d co-ht]\n"
            % (self.annotated, self.requests, self.italian,
               self.total, self.rho_warn, self.coht))
        sys.stderr.flush()

    def annotate(self, tweet_id, unstripped, text, hts):
        """Annotate one tweet; return the serialized payload or "".

        Side effects: writes to rho_log when an annotation's rho is exactly
        0.5, and to ht_log when the tweet carries two or more hashtags.
        """
        self.requests += 1
        annotations = self.annotator.annotate(text)
        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text
        }
        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        for annotation in annotations:
            # NOTE(review): deliberate exact float compare — 0.5 looks like
            # a sentinel from the extractor; verify against its contract.
            if annotation[1] == 0.5:
                self.rho_log.write(buff)
                self.rho_warn += 1
                break

        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff