Ejemplo n.º 1
0
def main(filename, outfilename, count, stop_after):
    """Write a random sample of accepted tweets to a gzip file.

    Reads ``filename`` (gzip, one JSON object per line), keeps only the
    tweets accepted by both the 'italian' language checker and the hashtag
    extractor, and maintains a reservoir of at most ``count`` of them.
    The reservoir is written to ``outfilename`` (gzip, one JSON object per
    line).

    Parameters:
        filename: path of the gzip input file.
        outfilename: path of the gzip output file (overwritten).
        count: maximum number of tweets to keep in the sample.
        stop_after: reading stops at the first accepted tweet whose line
            index is >= this value, provided the sample is already full.
    """
    sample = []
    # Number of tweets that passed both filters so far. The replacement
    # probability must be based on this count, not on the raw line index:
    # using the line index (which also counts rejected tweets) makes later
    # tweets less likely to be kept than earlier ones.
    accepted = 0
    checker = LanguageChecker('italian')
    hashtag = HashtagExtractor()

    print("Extracting a sample of %d tweets. Stopping after %d tweets" % (
        count, stop_after))

    with gzip.open(filename, 'r') as source:
        for idx, line in enumerate(source):
            jobj = json.loads(line)

            # Check that the tweet is actually italian
            if not checker.is_valid(jobj['text']):
                continue

            # Skip tweets for which the extractor returns nothing
            if not hashtag.extract(jobj):
                continue

            if len(sample) < count:
                sample.append(jobj)
            else:
                # randint is inclusive on both ends, so this tweet (the
                # accepted + 1-th one) replaces a slot with probability
                # count / (accepted + 1).
                slot = random.randint(0, accepted)

                if slot < count:
                    sample[slot] = jobj

            accepted += 1

            if idx >= stop_after and len(sample) >= count:
                break

    with gzip.open(outfilename, 'w') as output:
        for jobj in sample:
            output.write(json.dumps(jobj) + "\n")
Ejemplo n.º 2
0
def main(filename, outfilename, count, stop_after):
    """Write a random sample of accepted tweets to a gzip file.

    Reads ``filename`` (gzip, one JSON object per line), keeps only the
    tweets accepted by both the 'italian' language checker and the hashtag
    extractor, and maintains a reservoir of at most ``count`` of them.
    The reservoir is written to ``outfilename`` (gzip, one JSON object per
    line).

    Parameters:
        filename: path of the gzip input file.
        outfilename: path of the gzip output file (overwritten).
        count: maximum number of tweets to keep in the sample.
        stop_after: reading stops at the first accepted tweet whose line
            index is >= this value, provided the sample is already full.
    """
    sample = []
    # Count of tweets that passed both filters so far. Drawing the
    # replacement slot from the raw line index (which also counts rejected
    # tweets) would bias the sample towards early tweets.
    accepted = 0
    checker = LanguageChecker('italian')
    hashtag = HashtagExtractor()

    print("Extracting a sample of %d tweets. Stopping after %d tweets" % (count, stop_after))

    with gzip.open(filename, 'r') as source:
        for idx, line in enumerate(source):
            jobj = json.loads(line)

            # Check that the tweet is actually italian
            if not checker.is_valid(jobj['text']):
                continue

            # Skip tweets for which the extractor returns nothing
            if not hashtag.extract(jobj):
                continue

            if len(sample) < count:
                sample.append(jobj)
            else:
                # Both bounds of randint are inclusive: the current tweet
                # is kept with probability count / (accepted + 1).
                slot = random.randint(0, accepted)

                if slot < count:
                    sample[slot] = jobj

            accepted += 1

            if idx >= stop_after and len(sample) >= count:
                break

    with gzip.open(outfilename, 'w') as output:
        for jobj in sample:
            output.write(json.dumps(jobj) + "\n")
Ejemplo n.º 3
0
    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Create the helper objects, zero the counters and open the logs.

        ``input_file`` and ``output_file`` are only stored; ``rho_log`` and
        ``ht_log`` are opened immediately as gzip files in write mode.
        """
        # Paths are remembered as given; nothing is opened for them here.
        self.input_file, self.output_file = input_file, output_file

        # Helper objects used while processing.
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        # Every statistic starts from zero.
        self.total = self.italian = 0
        self.requests = self.annotated = 0
        self.rho_warn = self.coht = 0

        # Both log files are created (or truncated) right away.
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')
Ejemplo n.º 4
0
    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Prepare helpers, counters and log files for a processing run.

        The input and output paths are stored untouched. The two log paths
        are opened at once as gzip files in write mode.
        """
        # Remember where to read from and where to write to.
        self.input_file = input_file
        self.output_file = output_file

        # Collaborators: language check, hashtag handling, annotation.
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        # Counters, all initially zero.
        self.rho_warn = self.coht = 0
        self.requests = self.total = 0
        self.annotated = self.italian = 0

        # Opening in 'w' mode creates or truncates both logs immediately.
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')
Ejemplo n.º 5
0
class StreamMonitor(object):
    """Filter tweets read from SAMPLE_URL and append them to a gzip file."""

    def __init__(self, username, password, output):
        """Store the credentials and output path; build the language checker."""
        self.checker = LanguageChecker('italian')
        self.output = output
        self.auth = (username, password)

    def run(self):
        """Read the response line by line and keep the matching tweets.

        A tweet is kept when it has a 'text' field, its user language is
        'it', the language checker accepts its text and it carries more
        than one hashtag entity. Kept tweets are appended verbatim (plus a
        newline) to the gzip file at ``self.output`` and their text is
        printed.
        """
        response = requests.get(SAMPLE_URL, auth=self.auth, prefetch=False)

        with gzip.open(self.output, 'a') as sink:
            for raw in response.iter_lines():
                # Empty lines carry no tweet.
                if not raw:
                    continue

                tweet = json.loads(raw)

                # The checks run in this order so that a missing 'text'
                # key short-circuits before the other fields are read.
                if 'text' not in tweet:
                    continue
                if tweet['user']['lang'] != 'it':
                    continue
                if not self.checker.is_valid(tweet['text']):
                    continue
                if len(tweet['entities']['hashtags']) <= 1:
                    continue

                sink.write(raw + "\n")
                print(tweet['text'])
Ejemplo n.º 6
0
class StreamMonitor(object):
    """Collect matching tweets from SAMPLE_URL into a gzip file."""

    def __init__(self, username, password, output):
        """Keep the credentials and output path and create the checker."""
        self.checker = LanguageChecker("italian")
        self.auth = (username, password)
        self.output = output

    def _wanted(self, tweet):
        """Return True when ``tweet`` passes every filter, checked in order."""
        if "text" not in tweet:
            return False
        if tweet["user"]["lang"] != "it":
            return False
        if not self.checker.is_valid(tweet["text"]):
            return False
        # Only tweets with at least two hashtag entities are kept.
        return len(tweet["entities"]["hashtags"]) > 1

    def run(self):
        """Iterate over the response lines and store the wanted tweets.

        Each kept line is appended unchanged, followed by a newline, to the
        gzip file at ``self.output``; the tweet text is also printed.
        """
        reply = requests.get(SAMPLE_URL, auth=self.auth, prefetch=False)

        with gzip.open(self.output, "a") as archive:
            for entry in reply.iter_lines():
                # Blank lines are ignored.
                if not entry:
                    continue

                tweet = json.loads(entry)

                if self._wanted(tweet):
                    archive.write(entry + "\n")
                    print(tweet["text"])
Ejemplo n.º 7
0
 def __init__(self, username, password, output):
     """Store the credentials pair and output target; create the checker."""
     self.checker = LanguageChecker("italian")
     self.output = output
     # Credentials are kept together as a (username, password) tuple.
     self.auth = username, password
Ejemplo n.º 8
0
 def __init__(self, username, password, output):
     """Remember the credentials and output target; build the checker."""
     credentials = (username, password)
     self.checker = LanguageChecker('italian')
     self.auth = credentials
     self.output = output
Ejemplo n.º 9
0
class Annotator(object):
    """Annotate hashtag-bearing tweets read from a gzip file.

    The input file holds one JSON tweet per line. Tweets that pass the
    language check and carry hashtags are sent to the annotator; each
    successful annotation is written as one JSON line to the output file.
    Two extra gzip logs are kept: ``rho_log`` receives the annotated
    records flagged by the 0.5 check in ``annotate``, and ``ht_log``
    receives the hashtag lists that contain two or more hashtags.

    The log files are opened in the constructor and closed by ``close()``,
    which ``run()`` calls when it finishes (successfully or not). The
    object can also be used as a context manager.
    """

    # Progress line shared by the periodic and the final report.
    _PROGRESS = ("%d annotated of %d requested of %d italians of %d "
                 "processed [%d warning, %d co-ht]")

    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Create helpers, zero the counters and open both log files.

        Parameters:
            input_file: path of the gzip input (one JSON tweet per line).
            output_file: path of the gzip output, written by ``run()``.
            rho_log: path of the gzip log opened here in write mode.
            ht_log: path of the gzip log opened here in write mode.
        """
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        self.italian = 0
        self.annotated = 0

        self.total = 0
        self.requests = 0

        self.coht = 0
        self.rho_warn = 0

        self.rho_log = gzip.open(rho_log, 'w')
        try:
            self.ht_log = gzip.open(ht_log, 'w')
        except Exception:
            # Do not leave the first log open if the second cannot be opened.
            self.rho_log.close()
            raise

        self.input_file = input_file
        self.output_file = output_file

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close both log files so their gzip streams are finalized."""
        try:
            self.rho_log.close()
        finally:
            self.ht_log.close()

    def _report(self, terminator):
        """Write the current counters to stderr, ending with ``terminator``."""
        sys.stderr.write(self._PROGRESS % (
            self.annotated, self.requests, self.italian,
            self.total, self.rho_warn, self.coht) + terminator)
        sys.stderr.flush()

    def run(self, skip_lang=False):
        """Process the whole input file and write the annotated tweets.

        Parameters:
            skip_lang: when true, the language check is bypassed and every
                tweet is counted as italian.

        The log files are closed when this method returns or raises, so
        ``annotate()`` must not be called afterwards.
        """
        try:
            with gzip.open(self.output_file, 'w') as output:
                with gzip.open(self.input_file, 'r') as f:
                    for line in f:
                        # Named ``tweet`` rather than ``json`` so the json
                        # module is not shadowed inside this method.
                        tweet = loads(line)

                        unstripped = tweet['text']
                        tweet_id = tweet['id_str']
                        text = self.hashtag.sanitize(unstripped)

                        # Skip non italian tweets
                        self.total += 1

                        if not skip_lang and not self.lang.is_valid(text):
                            continue

                        self.italian += 1

                        hts = self.hashtag.extract(tweet)

                        # Skip text without hashtags
                        if not hts:
                            continue

                        buff = self.annotate(tweet_id, unstripped, text, hts)

                        if buff:
                            output.write(buff)

                        # '\r' keeps the progress report on a single
                        # terminal line.
                        if self.annotated % 1000 == 0:
                            self._report('\r')

            self._report('\n')
        finally:
            self.close()

    def annotate(self, tweet_id, unstripped, text, hts):
        """Annotate one tweet and return its JSON line, or "" if none.

        Parameters:
            tweet_id: stored under the "id" key of the record.
            unstripped: original tweet text (not used here).
            text: sanitized text passed to the annotator and stored under
                the "tweet" key.
            hts: hashtags of the tweet, stored under the "hts" key.

        Returns:
            The JSON-encoded record followed by a newline, or an empty
            string when the annotator returns nothing.
        """
        self.requests += 1
        annotations = self.annotator.annotate(text)

        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text
        }

        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        # Log the record once if any annotation has 0.5 as its second
        # element (presumably the rho score -- confirm with the annotator).
        if any(annotation[1] == 0.5 for annotation in annotations):
            self.rho_log.write(buff)
            self.rho_warn += 1

        # Co-occurring hashtags: two or more in the same tweet.
        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff
Ejemplo n.º 10
0
class Annotator(object):
    """Annotate hashtag-bearing tweets read from a gzip file.

    The input file holds one JSON tweet per line. Tweets that pass the
    language check and carry hashtags are sent to the annotator; each
    successful annotation is written as one JSON line to the output file.
    Two extra gzip logs are kept: ``rho_log`` receives the annotated
    records flagged by the 0.5 check in ``annotate``, and ``ht_log``
    receives the hashtag lists that contain two or more hashtags.

    The log files are opened in the constructor and closed by ``close()``,
    which ``run()`` calls when it finishes (successfully or not). The
    object can also be used as a context manager.
    """

    # Single definition of the progress line used by both reports.
    _PROGRESS = ("%d annotated of %d requested of %d italians of %d "
                 "processed [%d warning, %d co-ht]")

    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Create helpers, zero the counters and open both log files.

        Parameters:
            input_file: path of the gzip input (one JSON tweet per line).
            output_file: path of the gzip output, written by ``run()``.
            rho_log: path of the gzip log opened here in write mode.
            ht_log: path of the gzip log opened here in write mode.
        """
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        self.italian = 0
        self.annotated = 0

        self.total = 0
        self.requests = 0

        self.coht = 0
        self.rho_warn = 0

        self.rho_log = gzip.open(rho_log, 'w')
        try:
            self.ht_log = gzip.open(ht_log, 'w')
        except Exception:
            # Avoid leaking the first log when the second one fails to open.
            self.rho_log.close()
            raise

        self.input_file = input_file
        self.output_file = output_file

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close both log files so their gzip streams are finalized."""
        try:
            self.rho_log.close()
        finally:
            self.ht_log.close()

    def _report(self, terminator):
        """Write the current counters to stderr, ending with ``terminator``."""
        sys.stderr.write(
            self._PROGRESS
            % (self.annotated, self.requests, self.italian, self.total,
               self.rho_warn, self.coht)
            + terminator)
        sys.stderr.flush()

    def run(self, skip_lang=False):
        """Process the whole input file and write the annotated tweets.

        Parameters:
            skip_lang: when true, the language check is bypassed and every
                tweet is counted as italian.

        The log files are closed when this method returns or raises, so
        ``annotate()`` must not be called afterwards.
        """
        try:
            with gzip.open(self.output_file, 'w') as output:
                with gzip.open(self.input_file, 'r') as f:
                    for line in f:
                        # Use a name other than ``json`` so the module is
                        # not shadowed by the decoded tweet.
                        tweet = loads(line)

                        unstripped = tweet['text']
                        tweet_id = tweet['id_str']
                        text = self.hashtag.sanitize(unstripped)

                        # Skip non italian tweets
                        self.total += 1

                        if not skip_lang and not self.lang.is_valid(text):
                            continue

                        self.italian += 1

                        hts = self.hashtag.extract(tweet)

                        # Skip text without hashtags
                        if not hts:
                            continue

                        buff = self.annotate(tweet_id, unstripped, text, hts)

                        if buff:
                            output.write(buff)

                        # The carriage return makes successive reports
                        # overwrite each other on a terminal.
                        if self.annotated % 1000 == 0:
                            self._report('\r')

            self._report('\n')
        finally:
            self.close()

    def annotate(self, tweet_id, unstripped, text, hts):
        """Annotate one tweet and return its JSON line, or "" if none.

        Parameters:
            tweet_id: stored under the "id" key of the record.
            unstripped: original tweet text (not used here).
            text: sanitized text passed to the annotator and stored under
                the "tweet" key.
            hts: hashtags of the tweet, stored under the "hts" key.

        Returns:
            The JSON-encoded record followed by a newline, or an empty
            string when the annotator returns nothing.
        """
        self.requests += 1
        annotations = self.annotator.annotate(text)

        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text
        }

        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        # One log entry per record when any annotation has 0.5 as its
        # second element (presumably the rho score -- confirm).
        if any(annotation[1] == 0.5 for annotation in annotations):
            self.rho_log.write(buff)
            self.rho_warn += 1

        # Co-occurring hashtags: two or more in the same tweet.
        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff