コード例 #1
0
ファイル: random_sample.py プロジェクト: zymITsky/twittomatic
def main(filename, outfilename, count, stop_after):
    """Write a random sample of filtered tweets from one gzip file to another.

    Each line of ``filename`` is parsed as a JSON object. A tweet is a
    candidate only if ``LanguageChecker('italian').is_valid`` accepts its
    ``'text'`` field and ``HashtagExtractor().extract`` returns a truthy
    value for it. Candidates are sampled with reservoir sampling, so every
    candidate seen has the same chance of ending up in the output.

    Args:
        filename: Path of the gzip-compressed input, one JSON tweet per line.
        outfilename: Path of the gzip-compressed output to (over)write.
        count: Maximum number of tweets to keep in the sample.
        stop_after: Line index after which reading may stop, provided the
            sample already holds ``count`` tweets.
    """
    checker = LanguageChecker('italian')
    hashtag = HashtagExtractor()
    sample = []
    # Number of candidates seen so far. The reservoir draw must range over
    # candidates only; using the raw line index (which also counts rejected
    # tweets) would bias the sample toward the earliest candidates.
    eligible = 0

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("Extracting a sample of %d tweets. Stopping after %d tweets" % (
        count, stop_after))

    with gzip.open(filename, 'r') as source:
        for idx, line in enumerate(source):
            jobj = json.loads(line)

            # Check that the tweet is actually italian
            if not checker.is_valid(jobj['text']):
                continue

            # Skip tweets for which the hashtag extractor yields nothing.
            if not hashtag.extract(jobj):
                continue

            if len(sample) < count:
                # Reservoir not full yet: keep every candidate.
                sample.append(jobj)
            else:
                # randint is inclusive on both ends, so this picks one of
                # the eligible + 1 candidates seen, including this one.
                slot = random.randint(0, eligible)

                if slot < count:
                    sample[slot] = jobj

            eligible += 1

            # NOTE: this check is only reached for accepted tweets, so
            # reading can continue past stop_after until the next candidate.
            if idx >= stop_after and len(sample) >= count:
                break

    with gzip.open(outfilename, 'w') as output:
        for jobj in sample:
            output.write(json.dumps(jobj) + "\n")
コード例 #2
0
    def __init__(self, input_file, output_file, rho_log, ht_log):
        """Prepare helpers, zeroed counters and the two gzip log files.

        Args:
            input_file: Stored unchanged as ``self.input_file``.
            output_file: Stored unchanged as ``self.output_file``.
            rho_log: Path opened with ``gzip.open`` in ``'w'`` mode.
            ht_log: Path opened with ``gzip.open`` in ``'w'`` mode.
        """
        # Remember where to read from and write to.
        self.input_file = input_file
        self.output_file = output_file

        # Helper objects, built in the same order as before.
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        # Every counter starts from zero.
        self.italian = self.annotated = 0
        self.total = self.requests = 0
        self.coht = self.rho_warn = 0

        # Open the logs one at a time so rho_log is opened (and bound)
        # before ht_log is attempted.
        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')
コード例 #3
0
 def __init__(self):
     """Create the annotation and hashtag extractor helpers."""
     self.extractor = AnnotationExtractor()
     # NOTE(review): 'hastag' looks like a misspelling of 'hashtag'; kept
     # as-is because other code may read this attribute - confirm first.
     self.hastag = HashtagExtractor()