threshold = 0.8
progress = 0
results = []

with open(os.path.join(os.path.dirname(__file__), 'tweets_corpus/tweet-2016-07-06-clean.csv'), newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]
    cleaned = [(time, tweet, category, cleaner.clean(tweet))
               for (time, tweet, category) in tweets]
    tokenized = [(time, tweet, category, tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                 for (time, tweet, category, cleaned_tweets) in cleaned]

    for (time, tweet, category, tokens) in tokenized:
        progress += 1
        print('\r{}/{}'.format(progress, len(tokenized)), end='')
        result = []
        for (time2, tweet2, category2, tokens2) in tokenized:
            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
            dt2 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')
            if category2 == 'new' and dt > dt2:
                # time_diff = dt - dt2
                score = calculation.index(tokens, tokens2)
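# tokenizer.ngrams_tokenizer() is defined elsewhere in the project. A minimal
# sketch of what a word n-gram tokenizer of that shape could look like; the
# whitespace splitting and the space-joined n-grams are assumptions, not the
# project's actual implementation:
def ngrams_tokenizer(text, n):
    words = text.split()
    if n <= 1:
        return words
    # join each run of n consecutive words into a single token
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]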
ngrams = 1
progress = 0
results = []

with open(os.path.join(os.path.dirname(__file__), 'aaaaa.csv'), newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2], line[3]) for line in dataset]
    cleaned_tweets = [(time, tweet, cleaner.clean(tweet), time2, tweet2, cleaner.clean(tweet2))
                      for (time, tweet, time2, tweet2) in tweets]
    tokenized_tweets = [
        (time, tweet, cleaned, tokenizer.ngrams_tokenizer(cleaned, ngrams),
         time2, tweet2, cleaned2, tokenizer.ngrams_tokenizer(cleaned2, ngrams))
        for (time, tweet, cleaned, time2, tweet2, cleaned2) in cleaned_tweets
    ]
    # tfidf_input =
    # tfidf_obj = tfidf.TFIDF(cleaned_tweets)

    for (time, tweet, cleaned, tokens, time2, tweet2, cleaned2, tokens2) in tokenized_tweets:
        progress += 1
        print('\r{}/{}'.format(progress, len(tokenized_tweets)), end='')
        sm.set_seqs(cleaned, cleaned2)
        result = [
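# `sm` is not created in this snippet; from the set_seqs() call it is
# presumably a difflib.SequenceMatcher built once and reused for each pair.
# A minimal sketch of that assumed setup (the example strings are made up):
import difflib

sm = difflib.SequenceMatcher(None)
sm.set_seqs('some cleaned tweet text', 'some cleaned tweet')
print(sm.ratio())  # similarity ratio in [0.0, 1.0]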
tp, tn, fp, fn = 0, 0, 0, 0
progress = 0
for (time, tweet, category, cleaned_tweets) in cleaned:
    is_retweet = False
    for (time2, tweet2, category2, cleaned_tweets2) in cleaned:
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
        dt2 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')
        if category2 == 'new' and dt > dt2:
            time_diff = dt - dt2
            if time_diff <= timedelta(hours=hours):
                cal_res = []
                for cal_obj in calculations:
                    tweet_tokens = tokenizer.ngrams_tokenizer(cleaned_tweets, cal_obj['ngrams'])
                    tweet2_tokens = tokenizer.ngrams_tokenizer(cleaned_tweets2, cal_obj['ngrams'])
                    index = cal_obj['calculation'].index(tweet_tokens, tweet2_tokens)
                    cal_res.append(index >= cal_obj['threshold'])
                if cal_res.count(True) > len(calculations) / 2:
                    is_retweet = True
        if is_retweet:
            break
    if is_retweet:
        if category == 'retweet':
            tn += 1
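# `calculations` is defined elsewhere. From the lookups above
# (cal_obj['calculation'], cal_obj['ngrams'], cal_obj['threshold']) it is
# presumably a list of per-measure settings for the majority vote; the
# concrete measures and values below are only a hypothetical example:
calculations = [
    {'calculation': Jaccard(), 'ngrams': 1, 'threshold': 0.8},
    {'calculation': Cosine(), 'ngrams': 1, 'threshold': 0.7},
    {'calculation': Overlap(), 'ngrams': 4, 'threshold': 0.7},
]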
def calculate(hours):
    results = []
    calculation = calculations[args.calculation]
    for ngrams in range(1, 25):  # 1-24
        for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:  # 0.1-1.0
            start_time = tm.time()
            cleaned = [(time, tweet, category, cleaner.clean(tweet))
                       for (time, tweet, category) in tweets]
            tokenized = [(time, tweet, category, tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                         for (time, tweet, category, cleaned_tweets) in cleaned]
            distincts = []
            tp, tn, fp, fn = 0, 0, 0, 0
            for (time, tweet, category, tokens) in tokenized:
                if len(distincts) == 0:
                    distincts.append((time, tweet, tokens))
                else:
                    is_distinct = {'text': True, 'tl': True}
                    # iterate over a copy so expired entries can be removed safely
                    for (distinct_time, distinct_tweet, distinct_tokens) in distincts[:]:
                        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                        distinct_dt = datetime.strptime(distinct_time, '%Y-%m-%d %H:%M:%S')
                        time_diff = dt - distinct_dt
                        if time_diff > timedelta(hours=hours):
                            distincts.remove((distinct_time, distinct_tweet, distinct_tokens))
                            continue
                        index = calculation.index(tokens, distinct_tokens)
                        if index >= threshold:
                            is_distinct['text'] = False
                        if t.is_text_similar(tweet, distinct_tweet) and l.is_first_loc_similar(tweet, distinct_tweet):
                            is_distinct['tl'] = False
                    if is_distinct['text'] or is_distinct['tl']:
                        distincts.append((time, tweet, tokens))
                        if category == 'new':
                            tp += 1
                        else:
                            fp += 1
                    else:
                        if category == 'new':
                            fn += 1
                        else:
                            tn += 1
            time_elapsed = tm.time() - start_time
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            fscore = 2 * (precision * recall) / (precision + recall)
            print()
            print('Limit hours: {}'.format(hours))
            print('Calculation: {}'.format(args.calculation))
            print('Ngrams: {}'.format(ngrams))
            print('Threshold: {}'.format(threshold))
            print('True positive: {}'.format(tp))
            print('True negative: {}'.format(tn))
            print('False positive: {}'.format(fp))
            print('False negative: {}'.format(fn))
            print('Accuracy: {}'.format(accuracy))
            print('Precision: {}'.format(precision))
            print('Recall: {}'.format(recall))
            print('F-score: {}'.format(fscore))
            print('Time elapsed: {}'.format(time_elapsed))
            results.append([
                args.calculation, hours, ngrams, threshold, tp, tn, fp, fn,
                accuracy, precision, recall, fscore, time_elapsed
            ])
    return results
def calculate(hours):
    results = []
    for name, calculation in calculations.items():
        for ngrams in range(1, 7):  # 1-6
            # linspace avoids the floating-point end-point issues of arange
            for threshold in numpy.linspace(0.1, 1.0, 10):  # 0.1-1.0
                start_time = tm.time()
                cleaned = [(time, tweet, category, cleaner.clean(tweet))
                           for (time, tweet, category) in tweets]
                tokenized = [(time, tweet, category, tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                             for (time, tweet, category, cleaned_tweets) in cleaned]
                distincts = []
                tp, tn, fp, fn = 0, 0, 0, 0
                for (time, tweet, category, tokens) in tokenized:
                    if len(distincts) == 0:
                        distincts.append((time, tweet, tokens))
                    else:
                        is_distinct = True
                        # iterate over a copy so expired entries can be removed safely
                        for (distinct_time, distinct_tweet, distinct_tokens) in distincts[:]:
                            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                            distinct_dt = datetime.strptime(distinct_time, '%Y-%m-%d %H:%M:%S')
                            time_diff = dt - distinct_dt
                            if time_diff > timedelta(hours=hours):
                                distincts.remove((distinct_time, distinct_tweet, distinct_tokens))
                                continue
                            index = calculation.index(tokens, distinct_tokens)
                            if index >= threshold:
                                is_distinct = False
                                break
                        if is_distinct:
                            distincts.append((time, tweet, tokens))
                            if category == 'new':
                                tp += 1
                            else:
                                fp += 1
                        else:
                            if category == 'new':
                                fn += 1
                            else:
                                tn += 1
                time_elapsed = tm.time() - start_time
                accuracy = (tp + tn) / (tp + tn + fp + fn)
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                fscore = 2 * (precision * recall) / (precision + recall)
                print()
                print('Limit hours: {}'.format(hours))
                print('Calculation: {}'.format(name))
                print('Ngrams: {}'.format(ngrams))
                print('Threshold: {}'.format(threshold))
                print('True positive: {}'.format(tp))
                print('True negative: {}'.format(tn))
                print('False positive: {}'.format(fp))
                print('False negative: {}'.format(fn))
                print('Accuracy: {}'.format(accuracy))
                print('Precision: {}'.format(precision))
                print('Recall: {}'.format(recall))
                print('F-score: {}'.format(fscore))
                print('Time elapsed: {}'.format(time_elapsed))
                results.append([
                    name, hours, ngrams, threshold, tp, tn, fp, fn,
                    accuracy, precision, recall, fscore, time_elapsed
                ])
    return results
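# A sketch of how calculate() might be driven and its result rows persisted;
# the output filename, the list of hour limits, and the main guard are all
# assumptions for illustration, not part of the original script:
import csv

if __name__ == '__main__':
    header = ['calculation', 'hours', 'ngrams', 'threshold', 'tp', 'tn', 'fp', 'fn',
              'accuracy', 'precision', 'recall', 'fscore', 'time_elapsed']
    with open('grid_search_results.csv', 'w', newline='\n') as csv_output:
        csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        csv_writer.writerow(header)
        for hours in [1, 6, 12, 24]:  # assumed limit values
            for row in calculate(hours):
                csv_writer.writerow(row)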
parser.add_argument('-a', '--algo', type=str, default='jaccard', help='Algorithm: jaccard, cosine')
args = parser.parse_args()

if args.algo == 'jaccard':
    algo = Jaccard()
elif args.algo == 'cosine':
    algo = Cosine()
else:
    raise Exception('Algo not defined')

with open(os.path.join(os.path.dirname(__file__), 'tweets_corpus/similarity-dataset15075.csv'), newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1]) for line in dataset]
    cleaned = [(time, tweet, cleaner.clean(tweet)) for (time, tweet) in tweets]
    tokenized = [(time, tweet, tokenizer.ngrams_tokenizer(cleaned_tweets, args.ngrams))
                 for (time, tweet, cleaned_tweets) in cleaned]
    distincts = []
    progress = 0
    with open(os.path.join(os.path.dirname(__file__), args.output), 'w', newline='\n') as csv_output:
        csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for (time, tweet, tokens) in tokenized:
            progress += 1
            print('\r{}/{}'.format(progress, len(tokenized)), end='')
            if len(distincts) == 0:
                distincts.append((time, tweet, tokens))
                csv_writer.writerow([time, tweet, '[{}]'.format(','.join(tokens))])
            else:
                is_distinct = True
                for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
                    dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
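# The Jaccard and Cosine classes are defined elsewhere in the project; both
# are used through a common index(tokens_a, tokens_b) method that returns a
# score in [0.0, 1.0]. A minimal set-based sketch of what they might compute;
# the formulas below are assumptions, not the project's actual code:
import math


class Jaccard:
    def index(self, tokens_a, tokens_b):
        a, b = set(tokens_a), set(tokens_b)
        if not a or not b:
            return 0.0
        return len(a & b) / len(a | b)  # |A ∩ B| / |A ∪ B|


class Cosine:
    def index(self, tokens_a, tokens_b):
        a, b = set(tokens_a), set(tokens_b)
        if not a or not b:
            return 0.0
        return len(a & b) / math.sqrt(len(a) * len(b))  # set-based cosine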
hours = 12
calculation = Overlap()
threshold = 0.7
ngrams = 4
l = Location()

start_time = tm.time()
progress = 0
with open(os.path.join(os.path.dirname(__file__), 'tweets_corpus/similarity_dataset_15028.csv'), newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]
    cleaned = [(time, tweet, category, cleaner.clean(tweet))
               for (time, tweet, category) in tweets]
    tokenized = [(time, tweet, category, tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                 for (time, tweet, category, cleaned_tweets) in cleaned]
    distincts = []
    tp, tn, fp, fn = 0, 0, 0, 0
    for (time, tweet, category, tokens) in tokenized:
        progress += 1
        print('\r{}/{}'.format(progress, len(tokenized)), end='')
        if len(distincts) == 0:
            distincts.append((time, tweet, tokens))
        else:
            is_distinct = {'text': True, 'tl': True}
            for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
                dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                distinct_dt = datetime.strptime(distinct_time, '%Y-%m-%d %H:%M:%S')
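# Overlap() comes from elsewhere in the project. A minimal sketch of the
# overlap coefficient it presumably computes over the two token sets,
# |A ∩ B| / min(|A|, |B|); this interface and formula are assumptions:
class Overlap:
    def index(self, tokens_a, tokens_b):
        a, b = set(tokens_a), set(tokens_b)
        if not a or not b:
            return 0.0
        return len(a & b) / min(len(a), len(b))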