def main(e):
    start = '19980101000000'
    end = update_time(start, 3)
    path = '../dataset_files/train.jsonl.gz'
    print('opening file')
    with jsonl.open(path, gzip=True) as file:
        data = file.read()
    print('getting identifier')
    identifier = get_identifier(True)
    print('starting clustering')
    count = 0
    while start < '20180000000000':
        archives = window(data, start, end)
        matrix = get_tfidf(archives, identifier)
        if matrix.shape[0] == 0:
            # Empty window: record an empty cluster line and advance.
            group([], [])
            start = update_time(start, 1)
            end = update_time(start, 3)
            count += 1
            continue
        db = cluster(matrix, e)
        group(db.labels_, archives)
        start = update_time(start, 1)
        end = update_time(start, 3)
        count += 1
        print(count)
def cluster_sampling():
    """Clusters sample data."""
    start = '20160610000000'
    end = update_time(start, 3)
    path = '../dataset_files/train.jsonl.gz'
    print('opening file')
    with jsonl.open(path, gzip=True) as file:
        data = file.read()
    print('getting identifier')
    identifier = get_identifier(False)
    print('starting clustering')
    count = 0  # was incremented below without ever being initialized
    while start < '20160615000000':
        archives = window(data, start, end)
        matrix = get_tfidf(archives, identifier)
        if matrix.shape[0] == 0:
            group([], [])
            start = update_time(start, 1)
            end = update_time(start, 3)
            count += 1
            continue
        db = cluster(matrix, 0.93)
        group(db.labels_, archives)
        start = update_time(start, 1)
        end = update_time(start, 3)
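# Both functions above rely on an update_time helper that is defined
# elsewhere in the project. The sketch below is only an assumption about
# its behavior, inferred from the 14-digit YYYYMMDDHHMMSS timestamps and
# the 3-hour windows: it advances such a timestamp by `hours` hours. It
# is an illustration, not the project's actual implementation, and is
# named differently so it cannot shadow the real helper.
from datetime import datetime, timedelta

def update_time_sketch(timestamp, hours):
    # Parse the archive-style timestamp, shift it, and re-serialize.
    t = datetime.strptime(timestamp, '%Y%m%d%H%M%S')
    return (t + timedelta(hours=hours)).strftime('%Y%m%d%H%M%S')

# e.g. update_time_sketch('19980101000000', 3) -> '19980101030000'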
def run():
    with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as file:
        data = file.read()
    print('beginning preprocessing')
    texts = []
    count = 0
    with open('preprocessed.csv', 'a') as csvFile:
        writer = csv.writer(csvFile)
        for article in data:  # change to write full
            text = preprocess(article['text'])
            texts.append(text)
            writer.writerow(text)
            print(count)
            count += 1
    print('finished preprocessing')
    print('begin creating dictionary')
    dictionary = create_dictionary(data, texts)
    print('finished creating dictionary')
    print('finding tfidf vectors')
    vectors = tfidf(texts, dictionary)
    print('found tfidf vectors')
    print('writing identifier')
    write_identifier(data, vectors)
    print('finished writing identifier')
def main():
    pbar = tqdm(total=len(clusters), desc='Going through Clusters:')
    with jsonl.open('../clustering/cluster_pairings.jsonl') as writeFile:
        with Pool(processes=15) as pool:
            for smallDict in pool.imap(analyzeCluster, range(len(clusters))):
                writeFile.appendline(smallDict)
                pbar.update(1)
    pbar.close()
def main():
    with jsonl.open('../clustering/final_clusters_cleaned0.9_2.jsonl') as f:
        clusters = f.read()
    with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
        articles = ds.read()
    dictionary = createDictionary(articles)
    articleDict = {}
    pbar = tqdm(total=len(clusters), desc='Going through Clusters:')
    qbar = tqdm(total=70000, desc='Good articles found with >=4 summaries:')
    with Pool(processes=15) as pool:
        for smallDict in pool.imap_unordered(analyzeCluster,
                                             range(len(clusters))):
            for key in smallDict:
                articleDict[key] = smallDict[key]
                qbar.update(1)
            pbar.update(1)
    with open('../clustering/articleSummaryPairsMinLength.json', 'w+') as file:
        json.dump(articleDict, file)
def remove_cluster():
    if request.method == 'POST':
        cluster_id = request.form['cid']
        db.remove_cluster(cluster_id)
        cluster_list[int(cluster_id)] = {}
        with jsonl.open('../clustering/cluster_pairings.jsonl') as writeFile:
            writeFile.write(cluster_list)
        message = "Cluster Removed"
        # The home route takes both <cluster_id> and <message>; omitting
        # cluster_id would make url_for fail to build the URL.
        return redirect(url_for('home', cluster_id=cluster_id, message=message))
def group(clusters):
    fileName = '../clustering/final_clusters_0.9.jsonl'
    with jsonl.open(fileName) as file:
        for key in tqdm(clusters, desc='grouping'):
            if key == '-1':
                # Noise label: write each entry out as its own line.
                for cluster in clusters[key]:
                    file.appendline(cluster)
            else:
                file.appendline(merge(clusters[key]))
def prepare_data(in_file):
    """
    Writes pairs in the form of title and summary as:
        <title>\t<summary>\n
    """
    with jsonl.open(in_file, gzip=True) as inp:
        for entry in inp:
            # For self-scraped sets, some fields may be missing.
            if entry["title"] and entry["summary"]:
                yield parse(entry["title"]) + "\t" + parse(entry["summary"])
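# A minimal usage sketch for prepare_data(): stream the tab-separated
# pairs to a file. The wrapper name and default output path here are
# hypothetical, not part of the original pipeline.
def write_pairs_sketch(in_file, out_file='title_summary_pairs.tsv'):
    with open(out_file, 'w') as out:
        for pair in prepare_data(in_file):
            out.write(pair + '\n')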
def cluster():
    with jsonl.open('../clustering/clusters_0.9.jsonl') as file:
        windows = file.read()
    identifier = get_identifier(True)
    ind = 2
    w2 = windows[ind - 2]
    w3 = windows[ind - 1]
    w2length = len(w2)
    w3length = len(w3)
    pbar = tqdm(total=len(windows), desc='clustering', initial=2)
    while ind < len(windows):
        # Slide the three-window frame forward by one.
        w1 = w2
        w2 = w3
        w3 = windows[ind]
        w1length = w2length
        w2length = w3length
        w3length = len(w3)
        if len(w1) == 0:
            ind += 1
            pbar.update(1)
            continue
        matrix = average(w1, w2, w3, w1length, w2length, w3length, identifier)
        if matrix.shape[0] == 0:
            # Advance before skipping; the original bare `continue` here
            # left `ind` unchanged and looped forever on an empty matrix.
            ind += 1
            pbar.update(1)
            continue
        db = DBSCAN(eps=0.22, min_samples=2).fit(matrix)
        labels = db.labels_
        groups = {}
        for x, label in enumerate(labels):
            if x < w1length:
                if str(x) not in w1:
                    continue
                if str(label) in groups:
                    groups[str(label)].append(w1[str(x)])
                else:
                    groups[str(label)] = [w1[str(x)]]
            elif x < w1length + w2length:
                if str(x - w1length) not in w2:
                    continue
                # Window-2 and window-3 articles are only merged into
                # clusters that already exist and are not noise.
                if str(label) in groups and label >= 0:
                    groups[str(label)].append(w2.pop(str(x - w1length)))
            else:
                if str(x - w1length - w2length) not in w3:
                    continue
                if str(label) in groups and label >= 0:
                    groups[str(label)].append(
                        w3.pop(str(x - w1length - w2length)))
        group(groups)
        ind += 1
        pbar.update(1)
    pbar.close()
def summaryDict():
    texts = {}
    with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
        articles = ds.read()
    summaries = []
    for article in articles:
        summaries.append((article['summary'], article['archive']))
    for summary in tqdm(summaries, desc='summaries added'):
        # Keep only the first summary seen for each archive URL.
        if summary[1] not in texts:
            texts[summary[1]] = summary[0]
    with open('../clustering/summary_dict.json', 'w+') as outfile:
        json.dump(texts, outfile)
def main():
    with jsonl.open(
            '../clustering/final_clusters_cleaned0.9_2.jsonl') as writeFile:
        dictionary = createDictionary()
        pbar = tqdm(total=len(clusters), desc='Cleaning Clusters:')
        for cluster in clusters:
            tList = []
            content = ""
            for article in cluster:
                # Skip consecutive articles whose content matches the
                # previous article kept.
                if content != dictionary[article]:
                    content = dictionary[article]
                    tList.append(article)
            if len(tList) > 1:
                writeFile.appendline(tList)
            else:
                # Keep line counts aligned with the input by writing an
                # empty cluster.
                writeFile.appendline([])
            pbar.update(1)
        pbar.close()
def group(labels, archives):
    """
    Groups the archives of the articles using the labels output by
    clustering. The result is a dictionary with labels as keys and the
    list of archives in that cluster as the value. Appends the dictionary
    to a jsonl file, one line per window; noise points (label -1) are
    dropped.
    """
    fileName = '../clustering/clusters_0.9.jsonl'
    groups = {}
    with jsonl.open(fileName) as file:
        for x in range(len(labels)):
            if str(labels[x]) != '-1':  # skip DBSCAN noise points
                if str(labels[x]) in groups:
                    groups[str(labels[x])].append(archives[x])
                else:
                    groups[str(labels[x])] = [archives[x]]
        file.appendline(groups)
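# Worked example (made-up values) of group() above: labels come from
# DBSCAN, where -1 marks noise and is skipped, so only the two articles
# sharing cluster 0 are written.
#
#   group([0, -1, 0], ['archive_a', 'archive_b', 'archive_c'])
#   # appends {"0": ["archive_a", "archive_c"]} to clusters_0.9.jsonl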
def example_generator(data_path, single_pass):
    """Generates tf.Examples from NEWSROOM dataset data files.

    Args:
        data_path: Path to NEWSROOM dataset .data file.
        single_pass: Boolean. If True, go through the dataset exactly once,
            generating examples in the order they appear, then return.
            Otherwise, generate random examples indefinitely.

    Yields:
        Deserialized tf.Example.
    """
    while True:
        filelist = glob.glob(data_path)  # get the list of datafiles
        assert filelist, ('Error: Empty filelist at %s' %
                          data_path)  # check filelist isn't empty
        if single_pass:
            filelist = sorted(filelist)
        else:
            random.shuffle(filelist)
        for f in filelist:
            reader = jsonl.open(f, gzip=True)
            for entry in reader:
                curr_text = entry["text"]
                curr_summ = entry["summary"]
                # Skip entries with a missing or empty article or summary.
                if not curr_text or not curr_summ:
                    continue
                curr_summ = SENTENCE_START + " " + curr_summ + " " + SENTENCE_END
                curr_text = curr_text.encode('ascii', 'ignore')
                curr_summ = curr_summ.encode('ascii', 'ignore')
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend(
                    [curr_text])
                tf_example.features.feature['abstract'].bytes_list.value.extend(
                    [curr_summ])
                yield tf_example
        if single_pass:
            print("example_generator completed reading all datafiles. "
                  "No more data.")
            break
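# Hedged usage sketch: pull a few tf.Examples from the generator and
# inspect them. The data path is borrowed from the rest of this repo and
# may differ from the actual .data files used with this function.
if __name__ == '__main__':
    gen = example_generator('../dataset_files/train.jsonl.gz', single_pass=True)
    for _, example in zip(range(3), gen):
        print(example.features.feature['abstract'].bytes_list.value[0][:80])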
def prepare_data_first_sentence(in_file):
    """
    This version is similar to the parsing of DUC from NAMAS:
    https://github.com/facebookarchive/NAMAS/blob/master/DUC/make_DUC.py

        <title>\t<first_sentence>\n
    """
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    with jsonl.open(in_file, gzip=True) as inp:
        for entry in inp:
            # For self-scraped sets, some fields may be missing.
            if entry["title"] and entry["text"]:
                title = parse(entry["title"])
                text = entry["text"]
                sents = sent_detector.tokenize(text)
                if len(sents) == 0:
                    continue
                first = sents[0]
                # Short leads are extended with the second sentence.
                if len(first) < 130 and len(sents) > 1:
                    first += sents[1]
                yield title + "\t" + parse(first)
from multiprocessing import Pool
from threading import Lock
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import json
import jsonl  # was missing: jsonl.open is used below
import spacy
import wmd

nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)
#nlp = spacy.load('en_core_web_lg')
thresholds = [0.65]

with jsonl.open('../clustering/final_clusters_cleaned0.9_2.jsonl') as f:
    clusters = f.read()
with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
    articles = ds.read()


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


def createDictionary():
    """Creates a dictionary for the entire dataset with article archives as
    keys and (summary, text) tuples as values."""
    # Body completed from the identical createDictionary elsewhere in
    # this repo; the original was truncated after the for statement.
    dictionary = {}
    pbar = tqdm(total=len(articles), desc='Generating Dictionary:')
    for article in articles:
        dictionary[article['archive']] = (article['summary'], article['text'])
        pbar.update(1)
    return dictionary
from flask import Flask, render_template, request, redirect, url_for  # was missing
from plot import cdplot, complot, article_cdplot, article_complot
import random
import io
import base64
import sys
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import jsonl
from TextRank4Keyword import TextRank4Keyword

#nltk.download("punkt")
#nltk.download('averaged_perceptron_tagger')

app = Flask(__name__, template_folder='templates')
clusters = {}
with jsonl.open('../clustering/cluster_pairings.jsonl') as f:
    cluster_list = f.read()


@app.route('/')
def index():
    return render_template('base.html',
                           last_updated=dir_last_updated('static'))


@app.route('/<cluster_id>/<message>')
def home(cluster_id, message):
    return render_template('base.html',
                           last_updated=dir_last_updated('static'),
                           message=message)
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import json
import jsonl  # was missing: jsonl.open is used below


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


with open('../clustering/articleSummaryPairs.json') as f:
    articleListing = json.load(f)
with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
    articles = ds.read()


def createDictionary():
    '''Creates a dictionary mapping the archive link to a (summary, text)
    tuple.'''
    dictionary = {}
    pbar = tqdm(total=len(articles), desc='Generating Dictionary:')
    for article in articles:
        dictionary[article['archive']] = (article['summary'], article['text'])
        pbar.update(1)
    return dictionary


def main():
    '''Prints the good article-summary pairs which are left.'''
def data_matrix(event):
    """Creates a matrix of article-summary pairs stored as ASData objects."""
    # `event` is the name of the event (string)
    path = "../events/" + event + ".jsonl"
    with jsonl.open(path, gzip=False) as train_file:
        articles = train_file.read()
    summaries = []
    for article in articles:
        summaries.append(article['summary'])
    # Lists of the indices of valid summaries for each article.
    if event == 'Mandela':
        summary_lists = [[0, 2, 8], [1, 2], [0, 2], [0, 2, 3, 6, 7],
                         [0, 2, 4, 5, 6, 7, 8], [0, 2, 5, 6, 7], [5, 6],
                         [7], [8]]
    elif event == 'Orlando':
        summary_lists = [
            [0, 2, 3],  # summaries for article 0
            [0, 1, 3, 6, 7, 8],  # summaries for article 1, etc.
            [1, 2, 3, 6, 7],
            [0, 2, 3, 4, 7, 8, 9, 10],
            [0, 2, 3, 4, 7, 8],
            [1, 3, 4, 5, 8],
            [0, 2, 3, 4, 6, 7],
            [0, 2, 3, 4, 7, 8, 9, 10],
            [3, 8],
            [0, 2, 3, 4, 7, 8, 9, 10],
            [0, 2, 9, 10]
        ]
    elif event == 'bostonMarathon':
        summary_lists = [[0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 3, 4, 5, 6, 7, 8],
                         [0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8]]
    elif event == 'hurricaneSandy':
        summary_lists = [[0, 2, 3, 4], [0, 1, 4], [1, 2, 3], [3, 5],
                         [0, 1, 2, 4], [2, 4, 5]]
    else:
        # InputError was undefined; ValueError is the standard choice here.
        raise ValueError('You did not input a valid event.')
    matrix = []
    num = 0
    for article in articles:
        text = article['text']
        title = article['title']
        entries = []
        for index in range(len(summaries)):
            summary = summaries[index]
            if index in summary_lists[num]:
                # Valid pair: attach extractive statistics.
                fragments = Fragments(summary, text)
                obj = ASData(article, summary, title, True,
                             fragments.coverage(), fragments.density(),
                             fragments.compression())
            else:
                obj = ASData(article, summary, title, False)
            entries.append(obj)
        matrix.append(entries)
        num += 1
    return matrix
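# Hedged usage sketch: build the matrix for one annotated event and
# report its shape. 'Mandela' is one of the four events handled above.
if __name__ == '__main__':
    matrix = data_matrix('Mandela')
    print('%d articles x %d summaries' % (len(matrix), len(matrix[0])))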
pbar = tqdm(total=len(list(articles.keys())), desc="articles cleaned")
with Pool(processes=15) as pool:
    for summary_sets, key in pool.imap_unordered(removeSummaries, articles):
        for i in range(len(summary_sets)):
            if len(summary_sets[i]) >= 2:
                for summary in summary_sets[i]:
                    if key in cleaned_articles[i].keys():
                        if summary not in cleaned_articles[i][key]:
                            cleaned_articles[i][key].append(summary)
                    else:
                        cleaned_articles[i][key] = summary_sets[i]
        pbar.update(1)

for i in range(len(cleaned_articles)):
    arts = len(cleaned_articles[i])
    print('set: ' + str(i))
    sums = []
    for key in cleaned_articles[i]:
        sums.append(len(cleaned_articles[i][key]))
    print('num articles: ' + str(arts) + '\nnum summaries: ' +
          str(np.sum(sums)) + '\navg # summaries per article: ' +
          str(np.mean(sums)))

with jsonl.open('swms_article_summary_pairs.json', 'w+') as f:
    f.write(cleaned_articles)