Ejemplo n.º 1
0
class HTMLRenderer(object):
    """Render annotated tweets as an HTML page.

    Every tweet text is sanitized through a ``HashtagExtractor``, annotated
    through an ``AnnotationExtractor`` and written as a
    ``<div class='tweet'>`` element in which each annotated span is wrapped
    in an ``<a>`` tag carrying the annotation data.
    """

    def __init__(self):
        self.extractor = AnnotationExtractor()
        # NOTE: the attribute keeps its historical spelling ("hastag").
        self.hastag = HashtagExtractor()
        # Category lookup through a PortalExtractor is currently disabled.

    def run(self, inputfile, outputfile, epsilon):
        """Annotate every tweet of *inputfile* and write the HTML report.

        :param inputfile: path of a gzip file that is read line by line; each
            line is parsed with ``loads`` and must expose a ``'text'`` key.
        :param outputfile: path of the HTML file to write (opened with 'w').
        :param epsilon: forwarded unchanged to the extractor's ``annotate``.
        """
        # NOTE(review): render_single() returns ASCII-encoded text; writing it
        # to a text-mode file presumably relies on Python 2 ``str`` semantics.
        with gzip.open(inputfile, 'r') as source:
            with open(outputfile, 'w') as output:
                output.write(HTML_PAGE)

                for line in source:
                    tweet = loads(line)
                    text = self.hastag.sanitize(tweet['text'])
                    annotations = self.extractor.annotate(text,
                                                          is_tweet=False,
                                                          raw=True,
                                                          epsilon=epsilon)

                    output.write("<div class='tweet'>")
                    output.write("<div class='text'>")
                    output.write(self.render_single(text, annotations))
                    output.write("</div>\n")
                    output.write("</div>\n")

                output.write("</body>")

    def _close_pending(self, text, pieces, pending, prev_pos, limit):
        """Close pending anchors ending at or before *limit* (all if None).

        Appends the not-yet-emitted text up to each anchor's end followed by
        ``</a>`` to *pieces*, pops the anchor from *pending* and returns the
        updated position of the first character not emitted yet.
        """
        while pending and (limit is None or limit >= pending[0]['end']):
            closing = pending.pop(0)
            # Slice from prev_pos: everything before it is already in pieces.
            pieces.append(text[prev_pos:closing['end']])
            pieces.append('</a>')
            # Never move backwards, otherwise an annotation that overlaps the
            # end of its parent would make already-written text appear twice.
            prev_pos = max(prev_pos, closing['end'])
        return prev_pos

    def render_single(self, text, annotations):
        """Return *text* with every annotation wrapped in an ``<a>`` tag.

        :param text: the text the annotation offsets refer to.
        :param annotations: list of dicts with the keys ``'start'``,
            ``'end'``, ``'spot'``, ``'title'`` and ``'rho'``. The list is
            sorted in place by ``'start'``.
        :returns: the markup encoded to ASCII, non-ASCII characters being
            replaced by XML character references.

        An annotation whose successor starts before it ends is kept open and
        closed only once the text reaches its end, so the successor is
        rendered nested inside it.

        NOTE(review): ``spot``, ``title`` and the text are interpolated
        without HTML escaping; whether the inputs are already escaped cannot
        be told from this class -- confirm before rendering untrusted data.
        """
        pieces = []
        prev_pos = 0
        # Anchors opened but not closed yet, ordered by their end offset.
        pending = []
        annotations.sort(key=lambda x: x['start'])
        total = len(annotations)

        for current, annotation in enumerate(annotations):
            start, stop = annotation['start'], annotation['end']

            prev_pos = self._close_pending(text, pieces, pending,
                                           prev_pos, start)
            pieces.append(text[prev_pos:start])

            # Is the next annotation nested inside this one?
            next_nested = (current + 1 < total and
                           annotations[current + 1]['start'] < stop)
            if next_nested:
                pending.append(annotation)
                pending.sort(key=lambda x: x['end'])
                next_pos = annotations[current + 1]['start']
            else:
                next_pos = stop

            pieces.append(
                "<a href='#' data-spot='%s' data-title='%s' data-rho='%s'>%s" %
                (annotation['spot'], annotation['title'], annotation['rho'],
                 text[start:next_pos]))

            if not next_nested:
                pieces.append('</a>')

            prev_pos = next_pos

        # Every anchor still open must be closed before the trailing text.
        prev_pos = self._close_pending(text, pieces, pending, prev_pos, None)
        pieces.append(text[prev_pos:])

        html = "".join(pieces)
        return html.encode('ascii', 'xmlcharrefreplace')
class HTMLRenderer(object):
    """Render annotated tweets as an HTML page.

    Every tweet text is sanitized through a ``HashtagExtractor``, annotated
    through an ``AnnotationExtractor`` and written as a
    ``<div class='tweet'>`` element in which each annotated span is wrapped
    in an ``<a>`` tag carrying the annotation data.
    """

    def __init__(self):
        self.extractor = AnnotationExtractor()
        # NOTE: the attribute keeps its historical spelling ("hastag").
        self.hastag = HashtagExtractor()
        # Category lookup through a PortalExtractor is currently disabled.

    def run(self, inputfile, outputfile, epsilon):
        """Annotate every tweet of *inputfile* and write the HTML report.

        :param inputfile: path of a gzip file that is read line by line; each
            line is parsed with ``loads`` and must expose a ``"text"`` key.
        :param outputfile: path of the HTML file to write (opened with "w").
        :param epsilon: forwarded unchanged to the extractor's ``annotate``.
        """
        # NOTE(review): render_single() returns ASCII-encoded text; writing it
        # to a text-mode file presumably relies on Python 2 ``str`` semantics.
        with gzip.open(inputfile, "r") as source:
            with open(outputfile, "w") as output:
                output.write(HTML_PAGE)

                for line in source:
                    tweet = loads(line)
                    text = self.hastag.sanitize(tweet["text"])
                    annotations = self.extractor.annotate(
                        text, is_tweet=False, raw=True, epsilon=epsilon
                    )

                    output.write("<div class='tweet'>")
                    output.write("<div class='text'>")
                    output.write(self.render_single(text, annotations))
                    output.write("</div>\n")
                    output.write("</div>\n")

                output.write("</body>")

    def _close_pending(self, text, pieces, pending, prev_pos, limit):
        """Close pending anchors ending at or before *limit* (all if None).

        Appends the not-yet-emitted text up to each anchor's end followed by
        ``</a>`` to *pieces*, pops the anchor from *pending* and returns the
        updated position of the first character not emitted yet.
        """
        while pending and (limit is None or limit >= pending[0]["end"]):
            closing = pending.pop(0)
            # Slice from prev_pos: everything before it is already in pieces.
            pieces.append(text[prev_pos : closing["end"]])
            pieces.append("</a>")
            # Never move backwards, otherwise an annotation that overlaps the
            # end of its parent would make already-written text appear twice.
            prev_pos = max(prev_pos, closing["end"])
        return prev_pos

    def render_single(self, text, annotations):
        """Return *text* with every annotation wrapped in an ``<a>`` tag.

        :param text: the text the annotation offsets refer to.
        :param annotations: list of dicts with the keys ``"start"``,
            ``"end"``, ``"spot"``, ``"title"`` and ``"rho"``. The list is
            sorted in place by ``"start"``.
        :returns: the markup encoded to ASCII, non-ASCII characters being
            replaced by XML character references.

        An annotation whose successor starts before it ends is kept open and
        closed only once the text reaches its end, so the successor is
        rendered nested inside it.

        NOTE(review): ``spot``, ``title`` and the text are interpolated
        without HTML escaping; whether the inputs are already escaped cannot
        be told from this class -- confirm before rendering untrusted data.
        """
        pieces = []
        prev_pos = 0
        # Anchors opened but not closed yet, ordered by their end offset.
        pending = []
        annotations.sort(key=lambda x: x["start"])
        total = len(annotations)

        for current, annotation in enumerate(annotations):
            start, stop = annotation["start"], annotation["end"]

            prev_pos = self._close_pending(text, pieces, pending, prev_pos, start)
            pieces.append(text[prev_pos:start])

            # Is the next annotation nested inside this one?
            next_nested = (
                current + 1 < total and annotations[current + 1]["start"] < stop
            )
            if next_nested:
                pending.append(annotation)
                pending.sort(key=lambda x: x["end"])
                next_pos = annotations[current + 1]["start"]
            else:
                next_pos = stop

            pieces.append(
                "<a href='#' data-spot='%s' data-title='%s' data-rho='%s'>%s"
                % (
                    annotation["spot"],
                    annotation["title"],
                    annotation["rho"],
                    text[start:next_pos],
                )
            )

            if not next_nested:
                pieces.append("</a>")

            prev_pos = next_pos

        # Every anchor still open must be closed before the trailing text.
        prev_pos = self._close_pending(text, pieces, pending, prev_pos, None)
        pieces.append(text[prev_pos:])

        html = "".join(pieces)
        return html.encode("ascii", "xmlcharrefreplace")
Ejemplo n.º 3
0
class Annotator(object):
    """Annotate hashtagged tweets and dump them as gzip-compressed JSON lines.

    Besides the main output, two gzip side logs are written: ``rho_log``
    receives every record having an annotation whose second element equals
    0.5, and ``ht_log`` receives the hashtag list of every annotated tweet
    carrying two or more hashtags.
    """

    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Set up the helpers, reset the counters and open both side logs.

        :param input_file: path of the gzip file read by :meth:`run`.
        :param output_file: path of the gzip file written by :meth:`run`.
        :param rho_log: path of the gzip side log for 0.5-valued annotations.
        :param ht_log: path of the gzip side log for co-occurring hashtags.
        """
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        # Tweets that passed the language filter / produced annotations.
        self.italian = 0
        self.annotated = 0

        # Lines read from the input / calls made to the annotator.
        self.total = 0
        self.requests = 0

        # Records written to ht_log / rho_log respectively.
        self.coht = 0
        self.rho_warn = 0

        # Both logs stay open until close() is called (run() does so).
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')

        self.input_file = input_file
        self.output_file = output_file

    def close(self):
        """Close both side logs so their gzip streams are finalized.

        Calling it more than once is harmless.
        """
        try:
            self.rho_log.close()
        finally:
            self.ht_log.close()

    def _report_progress(self, terminator):
        """Write the counters summary to stderr, ended by *terminator*."""
        sys.stderr.write(
            "%d annotated of %d requested of %d italians of %d processed "
            "[%d warning, %d co-ht]" % (
                self.annotated, self.requests, self.italian, self.total,
                self.rho_warn, self.coht) + terminator)
        sys.stderr.flush()

    def run(self, skip_lang=False):
        """Process the whole input file and write the annotated records.

        Each input line is parsed with ``loads`` and must expose the keys
        ``'text'`` and ``'id_str'``. Tweets rejected by the language checker
        (unless *skip_lang* is true) or without hashtags are skipped.

        The side logs are closed when this method returns or raises, so the
        instance must not be used for further annotation afterwards.

        :param skip_lang: when true, the language check is bypassed.
        """
        try:
            with gzip.open(self.output_file, 'w') as output:
                with gzip.open(self.input_file, 'r') as source:
                    for line in source:
                        # Named ``tweet`` so the ``json`` module used by
                        # annotate() is not shadowed here.
                        tweet = loads(line)

                        unstripped = tweet['text']
                        tweet_id = tweet['id_str']
                        text = self.hashtag.sanitize(unstripped)

                        self.total += 1

                        # Skip non italian tweets
                        if not skip_lang and not self.lang.is_valid(text):
                            continue

                        self.italian += 1

                        hts = self.hashtag.extract(tweet)

                        # Skip text without hashtags
                        if not hts:
                            continue

                        buff = self.annotate(tweet_id, unstripped, text, hts)

                        if buff:
                            output.write(buff)

                        if self.annotated % 1000 == 0:
                            # '\r' keeps the progress on a single line.
                            self._report_progress('\r')

            self._report_progress('\n')
        finally:
            # The logs were never closed before, which could leave the gzip
            # files without their trailer.
            self.close()

    def annotate(self, tweet_id, unstripped, text, hts):
        """Annotate *text* and return the serialized record.

        :param tweet_id: identifier stored under ``"id"`` in the record.
        :param unstripped: original tweet text (currently unused).
        :param text: sanitized text sent to the annotator and stored under
            ``"tweet"``.
        :param hts: hashtags stored under ``"hts"``.
        :returns: the JSON record followed by a newline, or ``""`` when the
            annotator returned nothing.
        """
        self.requests += 1
        annotations = self.annotator.annotate(text)

        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text,
        }

        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        # presumably annotation[1] is the rho score and exactly 0.5 marks a
        # doubtful annotation -- TODO confirm against AnnotationExtractor.
        if any(annotation[1] == 0.5 for annotation in annotations):
            self.rho_log.write(buff)
            self.rho_warn += 1

        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff
Ejemplo n.º 4
0
class Annotator(object):
    """Annotate hashtagged tweets and dump them as gzip-compressed JSON lines.

    Besides the main output, two gzip side logs are written: ``rho_log``
    receives every record having an annotation whose second element equals
    0.5, and ``ht_log`` receives the hashtag list of every annotated tweet
    carrying two or more hashtags.
    """

    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Set up the helpers, reset the counters and open both side logs.

        :param input_file: path of the gzip file read by :meth:`run`.
        :param output_file: path of the gzip file written by :meth:`run`.
        :param rho_log: path of the gzip side log for 0.5-valued annotations.
        :param ht_log: path of the gzip side log for co-occurring hashtags.
        """
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        # Tweets that passed the language filter / produced annotations.
        self.italian = 0
        self.annotated = 0

        # Lines read from the input / calls made to the annotator.
        self.total = 0
        self.requests = 0

        # Records written to ht_log / rho_log respectively.
        self.coht = 0
        self.rho_warn = 0

        # Both logs stay open until close() is called (run() does so).
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')

        self.input_file = input_file
        self.output_file = output_file

    def close(self):
        """Close both side logs so their gzip streams are finalized.

        Calling it more than once is harmless.
        """
        try:
            self.rho_log.close()
        finally:
            self.ht_log.close()

    def _report_progress(self, terminator):
        """Write the counters summary to stderr, ended by *terminator*."""
        sys.stderr.write(
            "%d annotated of %d requested of %d italians of %d processed "
            "[%d warning, %d co-ht]" % (
                self.annotated, self.requests, self.italian, self.total,
                self.rho_warn, self.coht) + terminator)
        sys.stderr.flush()

    def run(self, skip_lang=False):
        """Process the whole input file and write the annotated records.

        Each input line is parsed with ``loads`` and must expose the keys
        ``'text'`` and ``'id_str'``. Tweets rejected by the language checker
        (unless *skip_lang* is true) or without hashtags are skipped.

        The side logs are closed when this method returns or raises, so the
        instance must not be used for further annotation afterwards.

        :param skip_lang: when true, the language check is bypassed.
        """
        try:
            with gzip.open(self.output_file, 'w') as output:
                with gzip.open(self.input_file, 'r') as source:
                    for line in source:
                        # Named ``tweet`` so the ``json`` module used by
                        # annotate() is not shadowed here.
                        tweet = loads(line)

                        unstripped = tweet['text']
                        tweet_id = tweet['id_str']
                        text = self.hashtag.sanitize(unstripped)

                        self.total += 1

                        # Skip non italian tweets
                        if not skip_lang and not self.lang.is_valid(text):
                            continue

                        self.italian += 1

                        hts = self.hashtag.extract(tweet)

                        # Skip text without hashtags
                        if not hts:
                            continue

                        buff = self.annotate(tweet_id, unstripped, text, hts)

                        if buff:
                            output.write(buff)

                        if self.annotated % 1000 == 0:
                            # '\r' keeps the progress on a single line.
                            self._report_progress('\r')

            self._report_progress('\n')
        finally:
            # The logs were never closed before, which could leave the gzip
            # files without their trailer.
            self.close()

    def annotate(self, tweet_id, unstripped, text, hts):
        """Annotate *text* and return the serialized record.

        :param tweet_id: identifier stored under ``"id"`` in the record.
        :param unstripped: original tweet text (currently unused).
        :param text: sanitized text sent to the annotator and stored under
            ``"tweet"``.
        :param hts: hashtags stored under ``"hts"``.
        :returns: the JSON record followed by a newline, or ``""`` when the
            annotator returned nothing.
        """
        self.requests += 1
        annotations = self.annotator.annotate(text)

        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text,
        }

        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        # presumably annotation[1] is the rho score and exactly 0.5 marks a
        # doubtful annotation -- TODO confirm against AnnotationExtractor.
        if any(annotation[1] == 0.5 for annotation in annotations):
            self.rho_log.write(buff)
            self.rho_warn += 1

        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff