Code example #1
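Code example 1 scores every tweet in the corpus against every earlier tweet labeled 'new'. The helpers `cleaner`, `tokenizer`, and `calculation` (an object exposing `index(tokens_a, tokens_b)`, as in the later examples) are project modules created outside the excerpt; only the standard-library imports can be stated with confidence. A minimal preamble for this and the following excerpts:

import argparse  # example 6
import csv
import os
import time as tm  # examples 4, 5, 7
from datetime import datetime, timedelta

import numpy  # example 5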
threshold = 0.8
ngrams = 1  # assumed value: `ngrams` is used below but not set in this excerpt

progress = 0
results = []

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/tweet-2016-07-06-clean.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]

cleaned = [(time, tweet, category, cleaner.clean(tweet))
           for (time, tweet, category) in tweets]
tokenized = [(time, tweet, category,
              tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
             for (time, tweet, category, cleaned_tweets) in cleaned]

for (time, tweet, category, tokens) in tokenized:
    progress += 1
    print('\r{}/{}'.format(progress, len(tokenized)), end='')

    # Parse this tweet's timestamp once; only the candidate's timestamp needs
    # parsing inside the inner loop.
    dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')

    result = []

    for (time2, tweet2, category2, tokens2) in tokenized:
        dt2 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')

        # Score only against strictly earlier tweets labeled 'new'.
        if category2 == 'new' and dt > dt2:
            score = calculation.index(tokens, tokens2)
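Every tweet is scored against every earlier 'new' tweet, so the scan is quadratic in the corpus size; parsing each timestamp once, outside the inner loop, at least keeps the repeated per-pair work down to the similarity computation itself.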
Code example #2
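Code example 2 reads pre-paired tweets (two timestamped tweets per CSV row), prepares cleaned and tokenized forms of both sides of each pair, and then walks the pairs; the commented-out lines are the remnants of a TF-IDF variant.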
ngrams = 1

progress = 0

results = []

with open(os.path.join(os.path.dirname(__file__), 'aaaaa.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2], line[3]) for line in dataset]

cleaned_tweets = [(time, tweet, cleaner.clean(tweet), time2, tweet2,
                   cleaner.clean(tweet2))
                  for (time, tweet, time2, tweet2) in tweets]
tokenized_tweets = [
    (time, tweet, cleaned, tokenizer.ngrams_tokenizer(cleaned, ngrams), time2,
     tweet2, cleaned2, tokenizer.ngrams_tokenizer(cleaned2, ngrams))
    for (time, tweet, cleaned, time2, tweet2, cleaned2) in cleaned_tweets
]

# tfidf_input =
# tfidf_obj = tfidf.TFIDF(cleaned_tweets)

for (time, tweet, cleaned, tokens, time2, tweet2, cleaned2,
     tokens2) in tokenized_tweets:
    progress += 1
    print('\r{}/{}'.format(progress, len(tokenized_tweets)), end='')

    # `sm` is presumably a difflib.SequenceMatcher created before this
    # excerpt; set_seqs() loads the two cleaned texts for comparison.
    sm.set_seqs(cleaned, cleaned2)

    result = [
Code example #3
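Code example 3 lets several similarity measures vote: a tweet is flagged as a retweet when more than half of the configured `calculations` (each a dict carrying its own `calculation` object, `ngrams`, and `threshold`) score it above threshold against an earlier 'new' tweet within the `hours` window. `cleaned`, `calculations`, and `hours` are defined earlier in the original script.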
tp, tn, fp, fn = 0, 0, 0, 0
progress = 0

for (time, tweet, category, cleaned_tweets) in cleaned:
    is_retweet = False
    dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    for (time2, tweet2, category2, cleaned_tweets2) in cleaned:
        dt2 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')

        if category2 == 'new' and dt > dt2:
            time_diff = dt - dt2
            if time_diff <= timedelta(hours=hours):
                cal_res = []
                for cal_obj in calculations:
                    tweet_tokens = tokenizer.ngrams_tokenizer(
                        cleaned_tweets, cal_obj['ngrams'])
                    tweet2_tokens = tokenizer.ngrams_tokenizer(
                        cleaned_tweets2, cal_obj['ngrams'])
                    index = cal_obj['calculation'].index(
                        tweet_tokens, tweet2_tokens)
                    cal_res.append(index >= cal_obj['threshold'])

                if cal_res.count(True) > len(calculations) / 2:
                    is_retweet = True

        if is_retweet:
            break

    if is_retweet:
        if category == 'retweet':
            tn += 1
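        # --- Plausible completion (not in the excerpt), mirroring the
        # tp/tn/fp/fn bookkeeping of examples 4 and 5, with 'new' as the
        # positive class. ---
        else:
            fn += 1  # a 'new' tweet wrongly flagged as a retweet
    else:
        if category == 'new':
            tp += 1  # a genuinely new tweet passed through
        else:
            fp += 1  # a retweet that slipped through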
Code example #4
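Code example 4 grid-searches the n-gram size and threshold for one similarity measure chosen via `args.calculation`. A sliding window of 'distinct' tweets is kept, entries older than `hours` are evicted, and a tweet counts as distinct when at least one of the two checks, the token score or the combined text/location test (`t` and `l`, helper objects defined elsewhere), finds no match in the window.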
def calculate(hours):
    results = []
    calculation = calculations[args.calculation]
    for ngrams in range(1, 25):  # 1-24
        for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                          1.0]:  # 0.1-1.0
            start_time = tm.time()

            cleaned = [(time, tweet, category, cleaner.clean(tweet))
                       for (time, tweet, category) in tweets]
            tokenized = [(time, tweet, category,
                          tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                         for (time, tweet, category, cleaned_tweets) in cleaned
                         ]

            distincts = []
            tp, tn, fp, fn = 0, 0, 0, 0

            for (time, tweet, category, tokens) in tokenized:
                if len(distincts) == 0:
                    distincts.append((time, tweet, tokens))
                else:
                    is_distinct = {'text': True, 'tl': True}
                    # Iterate over a copy: expired entries are removed from
                    # `distincts` inside the loop, which would otherwise skip
                    # the element that follows each removal.
                    for (distinct_time, distinct_tweet,
                         distinct_tokens) in list(distincts):
                        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                        distinct_dt = datetime.strptime(
                            distinct_time, '%Y-%m-%d %H:%M:%S')
                        time_diff = dt - distinct_dt

                        if time_diff > timedelta(hours=hours):
                            distincts.remove((distinct_time, distinct_tweet,
                                              distinct_tokens))
                            continue

                        index = calculation.index(tokens, distinct_tokens)
                        if index >= threshold:
                            is_distinct['text'] = False

                        if (t.is_text_similar(tweet, distinct_tweet) and
                                l.is_first_loc_similar(tweet, distinct_tweet)):
                            is_distinct['tl'] = False

                    if is_distinct['text'] or is_distinct['tl']:
                        distincts.append((time, tweet, tokens))

                        if category == 'new':
                            tp += 1
                        else:
                            fp += 1
                    else:
                        if category == 'new':
                            fn += 1
                        else:
                            tn += 1

            time_elapsed = tm.time() - start_time
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            fscore = 2 * (precision * recall) / (precision + recall)

            print()
            print('Limit hours: {}'.format(hours))
            print('Calculation: {}'.format(args.calculation))
            print('Ngrams: {}'.format(ngrams))
            print('Threshold: {}'.format(threshold))
            print('True positive: {}'.format(tp))
            print('True negative: {}'.format(tn))
            print('False positive: {}'.format(fp))
            print('False negative: {}'.format(fn))
            print('Accuracy: {}'.format(accuracy))
            print('Precision: {}'.format(precision))
            print('Recall: {}'.format(recall))
            print('F-score: {}'.format(fscore))
            print('Time elapsed: {}'.format(time_elapsed))

            results.append([
                args.calculation, hours, ngrams, threshold, tp, tn, fp, fn,
                accuracy, precision, recall, fscore, time_elapsed
            ])
    return results
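With an aggressive grid, a cell can yield no positive predictions, making `tp + fp` or `tp + fn` zero and crashing the metric lines with a ZeroDivisionError. A defensive variant, using a hypothetical `safe_div` helper that is not in the original code:

def safe_div(num, den):
    # Hypothetical helper: return 0.0 for degenerate cells instead of
    # raising ZeroDivisionError.
    return num / den if den else 0.0

precision = safe_div(tp, tp + fp)
recall = safe_div(tp, tp + fn)
fscore = safe_div(2 * precision * recall, precision + recall)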
Code example #5
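Code example 5 extends the same grid search to sweep every measure in `calculations` (here a name-to-object mapping) and collapses the distinctness test to a single boolean on the token score, breaking out of the window scan as soon as a match is found.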
def calculate(hours):
    results = []
    for name, calculation in calculations.items():
        for ngrams in range(1, 7):  # 1-6
            for threshold in numpy.arange(0.1, 1.1, 0.1):  # 0.1-1.0
                start_time = tm.time()

                cleaned = [(time, tweet, category, cleaner.clean(tweet))
                           for (time, tweet, category) in tweets]
                tokenized = [
                    (time, tweet, category,
                     tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                    for (time, tweet, category, cleaned_tweets) in cleaned
                ]

                distincts = []
                tp, tn, fp, fn = 0, 0, 0, 0

                for (time, tweet, category, tokens) in tokenized:
                    if len(distincts) == 0:
                        distincts.append((time, tweet, tokens))
                    else:
                        is_distinct = True
                        # Iterate over a copy: expired entries are removed
                        # from `distincts` inside the loop.
                        for (distinct_time, distinct_tweet,
                             distinct_tokens) in list(distincts):
                            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                            distinct_dt = datetime.strptime(
                                distinct_time, '%Y-%m-%d %H:%M:%S')
                            time_diff = dt - distinct_dt

                            if time_diff > timedelta(hours=hours):
                                distincts.remove(
                                    (distinct_time, distinct_tweet,
                                     distinct_tokens))
                                continue

                            index = calculation.index(tokens, distinct_tokens)
                            if index >= threshold:
                                is_distinct = False
                                break

                        if is_distinct:
                            distincts.append((time, tweet, tokens))

                            if category == 'new':
                                tp += 1
                            else:
                                fp += 1
                        else:
                            if category == 'new':
                                fn += 1
                            else:
                                tn += 1

                time_elapsed = tm.time() - start_time
                accuracy = (tp + tn) / (tp + tn + fp + fn)
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                fscore = 2 * (precision * recall) / (precision + recall)

                print()
                print('Limit hours: {}'.format(hours))
                print('Calculation: {}'.format(name))
                print('Ngrams: {}'.format(ngrams))
                print('Threshold: {}'.format(threshold))
                print('True positive: {}'.format(tp))
                print('True negative: {}'.format(tn))
                print('False positive: {}'.format(fp))
                print('False negative: {}'.format(fn))
                print('Accuracy: {}'.format(accuracy))
                print('Precision: {}'.format(precision))
                print('Recall: {}'.format(recall))
                print('F-score: {}'.format(fscore))
                print('Time elapsed: {}'.format(time_elapsed))

                results.append([
                    name, hours, ngrams, threshold, tp, tn, fp, fn, accuracy,
                    precision, recall, fscore, time_elapsed
                ])
    return results
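One caveat: `numpy.arange(0.1, 1.1, 0.1)` accumulates floating-point error, so thresholds come out as 0.30000000000000004 and the like, which leaks into both the comparisons and the printed report. Rounding the grid keeps it exact:

for threshold in numpy.round(numpy.arange(0.1, 1.1, 0.1), 1):  # 0.1-1.0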
Code example #6
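Code example 6 is the command-line front end: the measure is chosen with `-a/--algo`, and each surviving distinct tweet is written, together with its token list, to `args.output`. Only part of the argparse setup is shown; `args.ngrams` and `args.output` come from options defined outside the excerpt.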
parser.add_argument('-a', '--algo', type=str, default='jaccard',
                    help='Algorithm: jaccard, cosine')
args = parser.parse_args()

if args.algo == 'jaccard':
    algo = Jaccard()
elif args.algo == 'cosine':
    algo = Cosine()
else:
    raise ValueError('Unknown algorithm: {}'.format(args.algo))

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/similarity-dataset15075.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1]) for line in dataset]

cleaned = [(time, tweet, cleaner.clean(tweet)) for (time, tweet) in tweets]
tokenized = [(time, tweet,
              tokenizer.ngrams_tokenizer(cleaned_tweets, args.ngrams))
             for (time, tweet, cleaned_tweets) in cleaned]

distincts = []
progress = 0
with open(os.path.join(os.path.dirname(__file__), args.output), 'w',
          newline='\n') as csv_output:
    csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL)
    for (time, tweet, tokens) in tokenized:
        progress += 1
        print('\r{}/{}'.format(progress, len(tokenized)), end='')
        if len(distincts) == 0:
            distincts.append((time, tweet, tokens))
            csv_writer.writerow([time, tweet, '[{}]'.format(','.join(tokens))])
        else:
            is_distinct = True
            for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
                dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
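The if/elif dispatch grows awkward as measures are added; a dictionary mapping, sketched here with the same Jaccard and Cosine classes, also lets argparse reject unknown names by itself:

ALGOS = {'jaccard': Jaccard, 'cosine': Cosine}
parser.add_argument('-a', '--algo', choices=ALGOS, default='jaccard',
                    help='Algorithm: ' + ', '.join(ALGOS))
args = parser.parse_args()
algo = ALGOS[args.algo]()  # choices= has already validated the name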
Code example #7
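Code example 7 pins the parameters, presumably the best cell from the sweeps above (a 12-hour window, the Overlap measure, threshold 0.7, 4-grams), and evaluates the combined token-score and text/location criteria on a labeled corpus.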
hours = 12
calculation = Overlap()
threshold = 0.7
ngrams = 4
l = Location()

start_time = tm.time()
progress = 0

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/similarity_dataset_15028.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]

cleaned = [(time, tweet, category, cleaner.clean(tweet))
           for (time, tweet, category) in tweets]
tokenized = [(time, tweet, category,
              tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
             for (time, tweet, category, cleaned_tweets) in cleaned]

distincts = []
tp, tn, fp, fn = 0, 0, 0, 0

for (time, tweet, category, tokens) in tokenized:
    progress += 1
    print('\r{}/{}'.format(progress, len(tokenized)), end='')

    if len(distincts) == 0:
        distincts.append((time, tweet, tokens))
    else:
        is_distinct = {'text': True, 'tl': True}
        for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
            distinct_dt = datetime.strptime(distinct_time, '%Y-%m-%d %H:%M:%S')
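The excerpt stops inside the window scan; a plausible continuation of the inner loop, reconstructed from the matching logic of example 4 (the text/location branch would use the `Location` helper bound to `l` above), is:

            time_diff = dt - distinct_dt
            if time_diff > timedelta(hours=hours):
                distincts.remove((distinct_time, distinct_tweet,
                                  distinct_tokens))
                continue

            if calculation.index(tokens, distinct_tokens) >= threshold:
                is_distinct['text'] = False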