def get_locations(self, tweet):
    tweet = cleaner.clean(tweet)
    tagged_chunked_tweet = self.cp.parse(
        self.tnt_pos_tagger.tag(nltk.word_tokenize(tweet)))
    locations = []
    for subtree in tagged_chunked_tweet.subtrees():
        if subtree.label() == 'LOC':
            location = []
            for leaf in subtree.leaves():
                location.append(leaf[0])
            locations.append(' '.join(location))
    return locations
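# --- Illustrative sketch (not part of the original code) ---
# get_locations() above assumes self.tnt_pos_tagger and self.cp already exist.
# The setup below is an assumption about how they might be built with NLTK's
# TnT tagger and RegexpParser; the training sentences and the chunk grammar
# are hypothetical placeholders.
import nltk
from nltk.tag import tnt

train_data = [[('Jalan', 'NNP'), ('Sudirman', 'NNP'), ('banjir', 'VB')]]  # hypothetical tagged sentences
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)

grammar = 'LOC: {<NNP>+}'  # hypothetical rule: label runs of proper nouns as LOC
cp = nltk.RegexpParser(grammar)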
args = parser.parse_args()

sm = SequenceMatcher(lambda x: x == " ")
ngrams = 1
progress = 0
results = []

with open(os.path.join(os.path.dirname(__file__), 'aaaaa.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2], line[3]) for line in dataset]

cleaned_tweets = [(time, tweet, cleaner.clean(tweet),
                   time2, tweet2, cleaner.clean(tweet2))
                  for (time, tweet, time2, tweet2) in tweets]
tokenized_tweets = [
    (time, tweet, cleaned, tokenizer.ngrams_tokenizer(cleaned, ngrams),
     time2, tweet2, cleaned2, tokenizer.ngrams_tokenizer(cleaned2, ngrams))
    for (time, tweet, cleaned, time2, tweet2, cleaned2) in cleaned_tweets
]
# tfidf_input =
# tfidf_obj = tfidf.TFIDF(cleaned_tweets)

for (time, tweet, cleaned, tokens, time2, tweet2, cleaned2, tokens2) in tokenized_tweets:
    progress += 1
    print('\r{}/{}'.format(progress, len(tokenized_tweets)), end='')
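# --- Illustrative sketch (not part of the original code) ---
# tokenizer.ngrams_tokenizer(text, n) is a project-local helper that is only
# referenced in these scripts. A plausible minimal implementation (an
# assumption, not the project's actual code) returns word n-grams as
# space-joined strings:
def ngrams_tokenizer(text, n):
    words = text.split()
    if n <= 1:
        return words
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# e.g. ngrams_tokenizer('banjir di jalan sudirman', 2)
# -> ['banjir di', 'di jalan', 'jalan sudirman']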
                    help='File name for output CSV, e.g. output.csv')
args = parser.parse_args()

threshold = 0.55
progress = 0
results = []

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/tweet-2016-07-06-clean.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]

cleaned_tweets = [(time, tweet, category, cleaner.clean(tweet))
                  for (time, tweet, category) in tweets]

for (time, tweet, category, cleaned) in cleaned_tweets:
    progress += 1
    print('\r{}/{}'.format(progress, len(cleaned_tweets)), end='')
    result = []
    for (time2, tweet2, category2, cleaned2) in cleaned_tweets:
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
        dt2 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')
        if category2 == 'new' and dt > dt2:
            # time_diff = dt - dt2
            sm.set_seqs(cleaned, cleaned2)
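# --- Illustrative sketch (not part of the original code) ---
# The fragment above stops right after sm.set_seqs(). difflib.SequenceMatcher
# exposes ratio() for a 0..1 similarity score, which is presumably what the
# script compares against `threshold`; how the score feeds into `result` and
# `results` is an assumption here.
from difflib import SequenceMatcher

sm_demo = SequenceMatcher(lambda x: x == " ")
sm_demo.set_seqs('banjir jalan sudirman', 'banjir di jalan sudirman')
similarity = sm_demo.ratio()     # 0..1 score; higher means more similar
is_similar = similarity >= 0.55  # same threshold as above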
def calculate(hours):
    results = []
    calculation = calculations[args.calculation]
    for ngrams in range(1, 25):  # 1-24
        for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:  # 0.1-1.0
            start_time = tm.time()
            cleaned = [(time, tweet, category, cleaner.clean(tweet))
                       for (time, tweet, category) in tweets]
            tokenized = [(time, tweet, category,
                          tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                         for (time, tweet, category, cleaned_tweets) in cleaned]
            distincts = []
            tp, tn, fp, fn = 0, 0, 0, 0
            for (time, tweet, category, tokens) in tokenized:
                if len(distincts) == 0:
                    distincts.append((time, tweet, tokens))
                else:
                    # Two criteria: token-similarity index and text+location similarity.
                    is_distinct = {'text': True, 'tl': True}
                    for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
                        dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                        distinct_dt = datetime.strptime(
                            distinct_time, '%Y-%m-%d %H:%M:%S')
                        time_diff = dt - distinct_dt
                        if time_diff > timedelta(hours=hours):
                            # Drop distinct tweets older than the time window.
                            distincts.remove((distinct_time, distinct_tweet, distinct_tokens))
                            continue
                        index = calculation.index(tokens, distinct_tokens)
                        if index >= threshold:
                            is_distinct['text'] = False
                        if (t.is_text_similar(tweet, distinct_tweet)
                                and l.is_first_loc_similar(tweet, distinct_tweet)):
                            is_distinct['tl'] = False
                    if is_distinct['text'] or is_distinct['tl']:
                        distincts.append((time, tweet, tokens))
                        if category == 'new':
                            tp += 1
                        else:
                            fp += 1
                    else:
                        if category == 'new':
                            fn += 1
                        else:
                            tn += 1
            time_elapsed = tm.time() - start_time
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            fscore = 2 * (precision * recall) / (precision + recall)
            print()
            print('Limit hours: {}'.format(hours))
            print('Calculation: {}'.format(args.calculation))
            print('Ngrams: {}'.format(ngrams))
            print('Threshold: {}'.format(threshold))
            print('True positive: {}'.format(tp))
            print('True negative: {}'.format(tn))
            print('False positive: {}'.format(fp))
            print('False negative: {}'.format(fn))
            print('Accuracy: {}'.format(accuracy))
            print('Precision: {}'.format(precision))
            print('Recall: {}'.format(recall))
            print('F-score: {}'.format(fscore))
            print('Time elapsed: {}'.format(time_elapsed))
            results.append([
                args.calculation, hours, ngrams, threshold, tp, tn, fp, fn,
                accuracy, precision, recall, fscore, time_elapsed
            ])
    return results
    # ('extended_jaccard', ExtendedJaccard()),
    ('dice', Dice()),
    ('manhattan', Manhattan()),
    # ('euclidean', Euclidean()),
    ('overlap', Overlap()),
    # ('pearson', Pearson()),
    # ('combination', Combination())
]

hours = 12

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/similarity_dataset_15028.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1], line[2]) for line in dataset]

cleaned_tweets = [(time, tweet, category, cleaner.clean(tweet))
                  for (time, tweet, category) in tweets]
tfidf = TFIDF(cleaned_tweets)


def calculate(calculation):
    results = []
    for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:  # 0.1-1.0
        start_time = tm.time()
        distincts = []
        tp, tn, fp, fn = 0, 0, 0, 0
        for (time, tweet, category, cleaned) in cleaned_tweets:
            if len(distincts) == 0:
                distincts.append((time, tweet, cleaned))
            else:
                is_distinct = True
def calculate(hours):
    results = []
    for name, calculation in calculations.items():
        for ngrams in range(1, 7):  # 1-6
            for threshold in numpy.arange(0.1, 1.1, 0.1):  # 0.1-1.0
                start_time = tm.time()
                cleaned = [(time, tweet, category, cleaner.clean(tweet))
                           for (time, tweet, category) in tweets]
                tokenized = [
                    (time, tweet, category,
                     tokenizer.ngrams_tokenizer(cleaned_tweets, ngrams))
                    for (time, tweet, category, cleaned_tweets) in cleaned
                ]
                distincts = []
                tp, tn, fp, fn = 0, 0, 0, 0
                for (time, tweet, category, tokens) in tokenized:
                    if len(distincts) == 0:
                        distincts.append((time, tweet, tokens))
                    else:
                        is_distinct = True
                        for (distinct_time, distinct_tweet, distinct_tokens) in distincts:
                            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
                            distinct_dt = datetime.strptime(
                                distinct_time, '%Y-%m-%d %H:%M:%S')
                            time_diff = dt - distinct_dt
                            if time_diff > timedelta(hours=hours):
                                # Drop distinct tweets older than the time window.
                                distincts.remove(
                                    (distinct_time, distinct_tweet, distinct_tokens))
                                continue
                            index = calculation.index(tokens, distinct_tokens)
                            if index >= threshold:
                                is_distinct = False
                                break
                        if is_distinct:
                            distincts.append((time, tweet, tokens))
                            if category == 'new':
                                tp += 1
                            else:
                                fp += 1
                        else:
                            if category == 'new':
                                fn += 1
                            else:
                                tn += 1
                time_elapsed = tm.time() - start_time
                accuracy = (tp + tn) / (tp + tn + fp + fn)
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                fscore = 2 * (precision * recall) / (precision + recall)
                print()
                print('Limit hours: {}'.format(hours))
                print('Calculation: {}'.format(name))
                print('Ngrams: {}'.format(ngrams))
                print('Threshold: {}'.format(threshold))
                print('True positive: {}'.format(tp))
                print('True negative: {}'.format(tn))
                print('False positive: {}'.format(fp))
                print('False negative: {}'.format(fn))
                print('Accuracy: {}'.format(accuracy))
                print('Precision: {}'.format(precision))
                print('Recall: {}'.format(recall))
                print('F-score: {}'.format(fscore))
                print('Time elapsed: {}'.format(time_elapsed))
                results.append([
                    name, hours, ngrams, threshold, tp, tn, fp, fn,
                    accuracy, precision, recall, fscore, time_elapsed
                ])
    return results
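# --- Illustrative sketch (not part of the original code) ---
# calculate() only needs each entry in `calculations` to expose
# index(tokens_a, tokens_b) returning a 0..1 similarity. The Jaccard class
# below is an assumed minimal example of that interface, not necessarily the
# project's implementation (Dice, Manhattan, Overlap, etc. would differ only
# in the formula).
class Jaccard(object):
    def index(self, tokens_a, tokens_b):
        a, b = set(tokens_a), set(tokens_b)
        if not a and not b:
            return 0.0
        return len(a & b) / len(a | b)

# e.g. Jaccard().index(['banjir', 'sudirman'], ['banjir', 'thamrin']) -> 1/3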
parser.add_argument('-t', '--threshold', type=float, default=0.6,
                    help='Threshold index, default: 0.6')
parser.add_argument('-a', '--algo', type=str, default='jaccard',
                    help='Algorithm: jaccard, cosine')
args = parser.parse_args()

if args.algo == 'jaccard':
    algo = Jaccard()
elif args.algo == 'cosine':
    algo = Cosine()
else:
    raise Exception('Algo not defined')

with open(os.path.join(os.path.dirname(__file__),
                       'tweets_corpus/similarity-dataset15075.csv'),
          newline='\n') as csv_input:
    dataset = csv.reader(csv_input, delimiter=',', quotechar='"')
    tweets = [(line[0], line[1]) for line in dataset]

cleaned = [(time, tweet, cleaner.clean(tweet)) for (time, tweet) in tweets]
tokenized = [(time, tweet, tokenizer.ngrams_tokenizer(cleaned_tweets, args.ngrams))
             for (time, tweet, cleaned_tweets) in cleaned]

distincts = []
progress = 0
with open(os.path.join(os.path.dirname(__file__), args.output), 'w',
          newline='\n') as csv_output:
    csv_writer = csv.writer(csv_output, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL)
    for (time, tweet, tokens) in tokenized:
        progress += 1
        print('\r{}/{}'.format(progress, len(tokenized)), end='')
        if len(distincts) == 0:
            distincts.append((time, tweet, tokens))
            csv_writer.writerow([time, tweet, '[{}]'.format(','.join(tokens))])
        else:
            is_distinct = True
            for (distinct_time, distinct_tweet, distinct_tokens) in distincts: