Example #1
def main(e):
    start = '19980101000000'
    end = update_time(start, 3)
    path = '../dataset_files/train.jsonl.gz'
    print('opening file')
    with jsonl.open(path, gzip=True) as file:
        data = file.read()
    print('getting identifier')
    identifier = get_identifier(True)
    print('starting clustering')
    count = 0
    while start < '20180000000000':
        archives = window(data, start, end)
        matrix = get_tfidf(archives, identifier)
        if matrix.shape[0] == 0:
            group([], [])
            start = update_time(start, 1)
            end = update_time(start, 3)
            count += 1
            continue
        db = cluster(matrix, e)
        group(db.labels_, archives)
        start = update_time(start, 1)
        end = update_time(start, 3)
        count += 1
        print(count)
Example #2
def cluster_sampling():
    """Clusters sample data"""

    start = '20160610000000'
    end = update_time(start, 3)
    path = '../dataset_files/train.jsonl.gz'
    print('opening file')
    with jsonl.open(path, gzip=True) as file:
        data = file.read()
    print('getting identifier')
    identifier = get_identifier(False)
    print('starting clustering')
    count = 0  # needed by the empty-window branch below
    while start < '20160615000000':
        archives = window(data, start, end)
        matrix = get_tfidf(archives, identifier)
        if matrix.shape[0] == 0:
            group([], [])
            start = update_time(start, 1)
            end = update_time(start, 3)
            count += 1
            continue
        db = cluster(matrix, 0.93)
        group(db.labels_, archives)
        start = update_time(start, 1)
        end = update_time(start, 3)
Example #3
def run():
    with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as file:
        data = file.read()
    print('beginning preprocessing')
    texts = []
    count = 0
    with open('preprocessed.csv', 'a') as csvFile:
        writer = csv.writer(csvFile)
        for article in data:
            #change to write full
            text = preprocess(article['text'])
            texts.append(text)
            writer.writerow(text)
            print(count)
            count += 1
    print('finished preprocessing')
    print('begin creating dictionary')
    dict = create_dictionary(data, texts)
    print('finished creating dictionary')
    print('finding tfidf vectors')
    vectors = tfidf(texts, dict)
    print('found tfidf vectors')
    print('writing identifier')
    write_identifier(data, vectors)
    print('finished writing identifier')
Example #4
def main():
    articleDict = {}
    pbar = tqdm(total=len(clusters), desc='Going through Clusters:')
    with jsonl.open('../clustering/cluster_pairings.jsonl') as writeFile:
        with Pool(processes=15) as pool:
            for smallDict in pool.imap(analyzeCluster, range(len(clusters))):
                writeFile.appendline(smallDict)
                pbar.update(1)
Example #5
def main():
    with jsonl.open('../clustering/final_clusters_cleaned0.9_2.jsonl') as f:
        clusters = f.read()
    with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
        articles = ds.read()

    dict = createDictionary(articles)
    articleDict = {}
    pbar = tqdm(total=len(clusters), desc='Going through Clusters:')
    qbar = tqdm(total=70000, desc='Good articles found with >=4 summaries:')
    with Pool(processes=15) as pool:
        for smallDict in pool.imap_unordered(analyzeCluster, range(len(clusters))):
            for key in smallDict:
                articleDict[key] = smallDict[key]
                qbar.update(1)
            pbar.update(1)
    with open('../clustering/articleSummaryPairsMinLength.json', 'w+') as file:
        json.dump(articleDict, file)
Example #6
def remove_cluster():
    if request.method == 'POST':
        cluster_id = request.form['cid']
        db.remove_cluster(cluster_id)
        cluster_list[int(cluster_id)] = {}
        with jsonl.open('../clustering/cluster_pairings.jsonl') as writeFile:
            writeFile.write(cluster_list)
        message = "Cluster Removed"
        return redirect(url_for('home', message=message))
Example #7
def group(clusters):
    fileName = '../clustering/final_clusters_0.9.jsonl'
    with jsonl.open(fileName) as file:
        for key in tqdm(clusters, desc='grouping'):
            if key == '-1':
                for cluster in clusters[key]:
                    file.appendline(cluster)
            else:
                file.appendline(merge(clusters[key]))
Example #8
def prepare_data(in_file):
    """
    Writes pairs in the form of title and summary as:
    <title>\t<summary>\n
    """
    with jsonl.open(in_file, gzip=True) as inp:
        for entry in inp:
            # For self-scraped sets, some things may be missing
            if entry["title"] and entry["summary"]:
                yield parse(entry["title"]) + "\t" + parse(entry["summary"])
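A minimal driver for the generator above, sketched for illustration only (the output path and function name are hypothetical, not from the source):

def write_pairs(in_file, out_path='title_summary_pairs.txt'):
    # Stream the tab-separated <title>\t<summary> lines produced by
    # prepare_data() into a plain text file, one pair per line.
    with open(out_path, 'w', encoding='utf-8') as out:
        for pair in prepare_data(in_file):
            out.write(pair + '\n')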
Example #9
def cluster():
    with jsonl.open('../clustering/clusters_0.9.jsonl') as file:
        windows = file.read()
    identifier = get_identifier(True)

    ind = 2
    w2 = windows[ind - 2]
    w3 = windows[ind - 1]
    w2length = len(w2)
    w3length = len(w3)
    pbar = tqdm(total=len(windows), desc='clustering', initial=2)
    while ind < len(windows):
        w1 = w2
        w2 = w3
        w3 = windows[ind]
        w1length = w2length
        w2length = w3length
        w3length = len(w3)
        if (len(w1) == 0):
            ind += 1
            pbar.update(1)
            continue
        matrix = average(w1, w2, w3, w1length, w2length, w3length, identifier)
        if matrix.shape[0] == 0:
            # advance to the next window before continuing, otherwise an
            # empty matrix would make this loop spin forever
            ind += 1
            pbar.update(1)
            continue
        db = DBSCAN(eps=0.22, min_samples=2).fit(matrix)
        labels = db.labels_
        dict = {}
        for x, label in enumerate(labels, start=0):
            if x < w1length:
                if not (str(x) in w1):
                    continue
                if str(label) in dict:
                    dict[str(label)].append(w1[str(x)])
                else:
                    dict[str(label)] = [w1[str(x)]]
            elif x >= w1length and x < w1length + w2length:
                if not (str(x - w1length) in w2):
                    continue
                if str(label) in dict and label >= 0:
                    dict[str(label)].append(w2.pop(str(x - w1length)))
            else:
                if not (str(x - w1length - w2length) in w3):
                    continue
                if str(label) in dict and label >= 0:
                    dict[str(label)].append(
                        w3.pop(str(x - w1length - w2length)))
        group(dict)
        ind += 1
        pbar.update(1)

    pbar.close()
def summaryDict():
    texts = {}
    with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
        articles = ds.read()

    summaries = []
    for article in articles:
        summaries.append((article['summary'], article['archive']))

    for summary in tqdm(summaries, desc='summaries added'):
        if summary[1] not in texts:
            texts[summary[1]] = summary[0]

    with open('../clustering/summary_dict.json', 'w+') as outfile:
        json.dump(texts, outfile)
Example #11
def main():
    with jsonl.open(
            '../clustering/final_clusters_cleaned0.9_2.jsonl') as writeFile:
        dict = createDictionary()
        pbar = tqdm(total=len(clusters), desc='Cleaning Clusters:')
        for cluster in clusters:
            tList = []
            content = ""
            for article in cluster:
                if content != dict[article]:
                    content = dict[article]
                    tList.append(article)
            if len(tList) > 1:
                writeFile.appendline(tList)
            else:
                writeFile.appendline([])
            pbar.update(1)
Example #12
def group(labels, archives):
    """
    Groups the archives of the articles using the labels outputted from clustering.
    The result is a dictionary with labels as keys and a list of archives in that cluster
    as the value.
    Outputs the dictionary into a jsonl file with each line a new cluster.
    """

    fileName = '../clustering/clusters_0.9.jsonl'
    dict = {}
    with jsonl.open(fileName) as file:
        for x in range(len(labels)):
            if str(labels[x]) != '-1':
                if str(labels[x]) in dict:
                    dict[str(labels[x])].append(archives[x])
                else:
                    dict[str(labels[x])] = [archives[x]]
        file.appendline(dict)
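For illustration, a hypothetical call and the line it would append (labels and archive ids are made up):

# group([0, 0, 1, -1], ['arc0', 'arc1', 'arc2', 'arc3'])
# appends {'0': ['arc0', 'arc1'], '1': ['arc2']} to clusters_0.9.jsonl;
# the DBSCAN noise label -1 is skipped.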
def example_generator(data_path, single_pass):
    """Generates tf.Examples from NEWSROOM dataset data files.

  Args:
    data_path:
      Path to NEWSROOM dataset .data file.
    single_pass:
      Boolean. If True, go through the dataset exactly once, generating examples in the order they appear, then return. Otherwise, generate random examples indefinitely.

  Yields:
    Deserialized tf.Example.
  """
    while True:
        filelist = glob.glob(data_path)  # get the list of datafiles
        assert filelist, 'Error: Empty filelist at %s' % data_path  # check filelist isn't empty
        if single_pass:
            filelist = sorted(filelist)
        else:
            random.shuffle(filelist)
        for f in filelist:
            reader = jsonl.open(f, gzip=True)
            for entry in reader:
                curr_text = entry["text"]
                curr_summ = entry["summary"]
                if not curr_text or not curr_summ:
                    # skip entries with missing or empty text/summary
                    continue
                curr_summ = SENTENCE_START + " " + curr_summ + " " + SENTENCE_END
                curr_text = curr_text.encode('ascii', 'ignore')
                curr_summ = curr_summ.encode('ascii', 'ignore')
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend(
                    [curr_text])
                tf_example.features.feature[
                    'abstract'].bytes_list.value.extend([curr_summ])
                yield tf_example
        if single_pass:
            print(
                "example_generator completed reading all datafiles. No more data."
            )
            break
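One way this generator might be consumed, sketched under assumptions (the output path and the length-prefixed framing are not from this code; the framing follows a common convention for summarization .bin files):

import struct

def write_examples_to_bin(data_path, out_path='train.bin'):
    # Serialize each tf.Example and prefix it with its byte length so the
    # records can be read back one at a time.
    with open(out_path, 'wb') as writer:
        for tf_example in example_generator(data_path, single_pass=True):
            example_str = tf_example.SerializeToString()
            writer.write(struct.pack('q', len(example_str)))
            writer.write(struct.pack('%ds' % len(example_str), example_str))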
Example #14
def prepare_data_first_sentence(in_file):
    """
    This version is similar to the parsing of DUC from NAMAS.
    https://github.com/facebookarchive/NAMAS/blob/master/DUC/make_DUC.py
    <title>\t<first_sentence>\n
    """
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    with jsonl.open(in_file, gzip=True) as inp:
        for entry in inp:
            # For self-scraped sets, some things may be missing
            if entry["title"] and entry["text"]:
                title = parse(entry["title"])
                text = entry["text"]
                sents = sent_detector.tokenize(text)
                if len(sents) == 0:
                    continue

                first = sents[0]
                if len(first) < 130 and len(sents) > 1:
                    first += " " + sents[1]  # join the two sentences with a space

                yield parse(entry["title"]) + "\t" + parse(first)
from multiprocessing import Pool
from threading import Lock
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import json
import jsonl
import spacy
import wmd

nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)
#nlp = spacy.load('en_core_web_lg')

thresholds = [0.65]

with jsonl.open('../clustering/final_clusters_cleaned0.9_2.jsonl') as f:
    clusters = f.read()
with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
    articles = ds.read()

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

def createDictionary():
    """Creates dictionary for entire dataset with article archives as keys and
    (summary, text) as values."""
    dict = {}
    pbar = tqdm(total=len(articles), desc='Generating Dictionary:')
    for article in articles:
Example #16
from plot import cdplot, complot, article_cdplot, article_complot
from flask import Flask, render_template
import random
import io
import base64
import sys
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import jsonl
from TextRank4Keyword import TextRank4Keyword
#nltk.download("punkt")
#nltk.download('averaged_perceptron_tagger')

app = Flask(__name__, template_folder='templates')
clusters = {}
with jsonl.open('../clustering/cluster_pairings.jsonl') as f:
    cluster_list = f.read()


@app.route('/')
def index():
    return render_template('base.html',
                           last_updated=dir_last_updated('static'))


@app.route('/<cluster_id>/<message>')
def home(cluster_id, message):
    return render_template('base.html',
                           last_updated=dir_last_updated('static'),
                           message=message)
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import json
import jsonl


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


with open('../clustering/articleSummaryPairs.json') as f:
    articleListing = json.load(f)
with jsonl.open('../dataset_files/train.jsonl.gz', gzip=True) as ds:
    articles = ds.read()


def createDictionary():
    '''creates a dictionary mapping the archive link to (summary, text) tuple'''
    dict = {}
    pbar = tqdm(total=len(articles), desc='Generating Dictionary:')
    for article in articles:
        dict[article['archive']] = (article['summary'], article['text'])
        pbar.update(1)
    return dict


def main():
    '''prints the good article summary pairs which are left'''
Example #18
def data_matrix(event):
    """Creates matrix of article-summary pairs stored as ASData objects"""

    #enter name of event (string)
    path = "../events/" + event + ".jsonl"
    with jsonl.open(path, gzip=False) as train_file:
        articles = train_file.read()

    summaries = []
    for article in articles:
        summaries.append(article['summary'])

    #lists for the indices of valid summaries for each article
    if event == 'Mandela':
        summary_lists = [[0, 2, 8], [1, 2], [0, 2], [0, 2, 3, 6, 7],
                         [0, 2, 4, 5, 6, 7, 8], [0, 2, 5, 6, 7], [5, 6], [7],
                         [8]]

    elif event == 'Orlando':
        summary_lists = [
            [0, 2, 3],  #summaries for article 0
            [0, 1, 3, 6, 7, 8],  #summaries for article 1, etc
            [1, 2, 3, 6, 7],
            [0, 2, 3, 4, 7, 8, 9, 10],
            [0, 2, 3, 4, 7, 8],
            [1, 3, 4, 5, 8],
            [0, 2, 3, 4, 6, 7],
            [0, 2, 3, 4, 7, 8, 9, 10],
            [3, 8],
            [0, 2, 3, 4, 7, 8, 9, 10],
            [0, 2, 9, 10]
        ]

    elif event == 'bostonMarathon':
        summary_lists = [[0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8],
                         [0, 1, 2, 4, 5, 6, 7, 8]]

    elif event == 'hurricaneSandy':
        summary_lists = [[0, 2, 3, 4], [0, 1, 4], [1, 2, 3], [3, 5],
                         [0, 1, 2, 4], [2, 4, 5]]

    else:
        raise InputError('You did not input a valid event.')

    matrix = []
    num = 0
    for article in articles:
        text = article['text']
        title = article['title']
        entries = []
        for index in range(len(summaries)):
            summary = summaries[index]
            if index in summary_lists[num]:
                #print('in if')
                fragments = Fragments(summary, text)
                obj = ASData(article, summary, title, True,
                             fragments.coverage(), fragments.density(),
                             fragments.compression())
                entries.append(obj)
            else:
                #print('in else')
                obj = ASData(article, summary, title, False)
                entries.append(obj)
        matrix.append(entries)
        num += 1

    return matrix
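A small usage sketch for illustration (the event name is one of the branches above; the print is hypothetical):

# matrix = data_matrix('Mandela')
# each row holds the ASData objects pairing one article with every summary:
# print(len(matrix), 'articles x', len(matrix[0]), 'summaries per article')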
Example #19
                        if not (summary in cleaned_articles[i][key]):
                            cleaned_articles[i][key].append(summary)
                    else:
                        cleaned_articles[i][key] = summary_sets[i]
    '''

    pbar = tqdm(total=len(list(articles.keys())), desc="articles cleaned")
    with Pool(processes=15) as pool:
        for summary_sets, key in pool.imap_unordered(removeSummaries, articles):
            for i in range(len(summary_sets)):
                if len(summary_sets[i]) >= 2:
                    for summary in summary_sets[i]:
                        if key in cleaned_articles[i].keys():
                            if not (summary in cleaned_articles[i][key]):
                                cleaned_articles[i][key].append(summary)
                        else:
                            cleaned_articles[i][key] = summary_sets[i]
            pbar.update(1)

    for i in range(len(cleaned_articles)):
        arts = len(cleaned_articles[i])
        print('set: '+str(i))
        sums = []
        for key in cleaned_articles[i]:
            sums.append(len(cleaned_articles[i][key]))
        print('num articles: '+str(arts)+'\nnum summaries: '+str(np.sum(sums))+
            '\navg # summaries per article: '+str(np.mean(sums)))

    with jsonl.open('swms_article_summary_pairs.json', 'w+') as f:
        f.write(cleaned_articles)