def tfidf(input_path, output_path, overwrite=False):
    """Compute a TF-IDF matrix for a JSON document set and save it as a dense JSON matrix."""

    logging.info("TF-IDF requested on document set '%s'...", input_path)

    if not utility.check_output_necessary(output_path, overwrite):
        return

    logging.info("Loading documents...")

    with open(input_path, 'r') as infile:
        docs = json.load(infile)

    corpus = []
    for document in tqdm(docs):
        corpus.append(document["text"])

    logging.info("Running TF-IDF...")
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(corpus)

    logging.info("Saving TF-IDF matrix...")
    with open(output_path, 'w') as outfile:
        json.dump(tfidf_matrix.todense().tolist(), outfile)
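# Example usage (illustrative sketch only): assumes a docs JSON shaped like
# [{"text": "..."}, ...] and this repo's utility.check_output_necessary helper;
# the paths below are hypothetical.
#
#   logging.basicConfig(level=logging.INFO)
#   tfidf("data/docs.json", "data/tfidf.json", overwrite=True)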
def prune_articles(input_file,
                   output_file,
                   source_threshold=20,
                   word_threshold=500,
                   overwrite=False):
    """Prune a webhose article CSV, keeping articles above a minimum word count from sources above a minimum article count."""
    logging.info(
        "Pruned article set requested for webhose dataset at '%s', minimum source count of "
        "%i and minimum word count of %i", output_file, source_threshold,
        word_threshold)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_file, overwrite):
        return

    # load the data
    article_table = pd.read_csv(input_file)

    # prune by approximate word count (count the spaces in the text)
    article_table = article_table[article_table.apply(
        lambda x: x['text'].count(' ') > word_threshold, axis=1)]

    # prune by source article count
    source_counts = article_table.site.value_counts()
    acceptable_sources = list(
        source_counts[source_counts > source_threshold].index)
    article_table = article_table[article_table.site.isin(acceptable_sources)]

    logging.info("Pruned to %i articles", article_table.shape[0])

    # write it out
    article_table.to_csv(output_file)
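# Example usage (illustrative sketch only): assumes a webhose CSV with 'text'
# and 'site' columns as used above; the file names are hypothetical.
#
#   prune_articles("data/webhose.csv", "data/webhose_pruned.csv",
#                  source_threshold=20, word_threshold=500, overwrite=True)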
def preprocess(input_folder, output_path, count=-1, overwrite=False):
    """Run the preprocess process on all documents in dataset."""

    logging.info("Preprocessing requested for kaggle1 dataset at '%s'", output_path)

    # check if output file already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    # ensure nltk datasets are present
    logging.debug("Ensuring nltk sets...")
    nltk.download("stopwords")
    nltk.download("wordnet")

    # load the data
    logging.info("Loading article data...")
    logging.debug("Loading articles1.csv...")
    article_table1 = pd.read_csv(input_folder + "/articles1.csv")
    logging.debug("Loading articles2.csv...")
    article_table2 = pd.read_csv(input_folder + "/articles2.csv")
    logging.debug("Loading articles3.csv...")
    article_table3 = pd.read_csv(input_folder + "/articles3.csv")

    article_table = pd.concat([article_table1, article_table2, article_table3])

    out = codecs.open(output_path, 'w', 'utf-8')

    logging.info("Parsing...")
    bar = IncrementalBar("Parsing", max=len(article_table.index))
    counter = 0
    for article in article_table.content:
        if count >= 0 and counter == count:
            break
        counter += 1

        # get the sentences from this article
        for sentence in article.split("."):
            tokens = parse_sentence(sentence)
            if len(tokens) > 0:
                out.write(' '.join(tokens) + '\n')
            out.write(" \n")
        bar.next()
    bar.finish()
    out.close()

    logging.info("Preprocessing completed, output at '%s'", output_path)
def sentencify(input_folder, output_path, count=-1, overwrite=False):
    """Create a file of sentences from the 3 csv files.

    A count of -1 means output _all_ sentences.
    input_folder is assumed to have no trailing /.
    """

    logging.info("Sentence data requested for kaggle1 dataset at '%s'...",
                 output_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    # load the data
    logging.info("Loading article data...")
    logging.debug("Loading articles1.csv...")
    article_table1 = pd.read_csv(input_folder + "/articles1.csv")
    logging.debug("Loading articles2.csv...")
    article_table2 = pd.read_csv(input_folder + "/articles2.csv")
    logging.debug("Loading articles3.csv...")
    article_table3 = pd.read_csv(input_folder + "/articles3.csv")

    article_table = pd.concat([article_table1, article_table2, article_table3])

    # split every sentence from every article on '.'
    logging.info("Splitting articles...")
    sentences = []
    for article in article_table.content:
        if count != -1 and len(sentences) > count:
            break

        sentences.extend(article.split("."))

    # chop down to size as needed
    if count != -1 and len(sentences) > count:
        sentences = sentences[:count]

    # write out the file
    logging.info("Saving sentence data to '%s'", output_path)
    with open(output_path, 'w') as file_out:
        for sentence in sentences:
            file_out.write("{0}\n".format(sentence))

    logging.info("Sentence data saved to '%s'", output_path)
def combine(input_file_1, input_file_2, output_file, overwrite=False):
    """Concatenate two JSON feature sets column-wise and save the combined rows as JSON."""
    logging.info("Combined feature set requested from %s and %s", input_file_1, input_file_2)

    if not utility.check_output_necessary(output_file, overwrite):
        return

    df1 = pd.read_json(input_file_1)
    df2 = pd.read_json(input_file_2)

    combined_df = pd.concat([df1, df2], axis=1)

    logging.info("Saving combined feature set to '%s'", output_file)

    rows = combined_df.values.tolist()
    with open(output_file, 'w') as outfile:
        json.dump(rows, outfile)

    logging.info("Feature set saved")
def run(
    input_path,
    output_path,
    embed_dim=200,
    window=5,
    min_count=10,
    workers=4,
    epochs=2,
    overwrite=False,
):
    """The primary function for generating the word2vec model."""
    print("Yep 2")

    logging.info("Word2vec model requested for input '%s', output '%s'", input_path, output_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    logging.info("Running word2vec on '%s', outputting to '%s'...", input_path, output_path)

    # convert input
    logging.debug("Converting input into sentences...")
    sentences = Sentences(input_path)

    # run the model creation function
    logging.debug("Generating model...")
    logging.info("Model params: %i, %i, %i, %i, %i", embed_dim, window, min_count, workers, epochs)
    # note: `size` and `iter` are the pre-4.0 gensim parameter names
    # (vector_size and epochs in gensim 4.x)
    model = gensim.models.Word2Vec(
        sentences,
        size=embed_dim,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=1,
        iter=epochs,
    )
    model.save(output_path)

    logging.info("Word2Vec model saved to '%s'", output_path)
def tokenize(input_file, output_path, count=-1, overwrite=False):
    """Sentence-tokenize and POS-tag a JSON document set, writing pos/doc_sent/sent_doc JSON files into output_path."""
    logging.info("Tokenization requested on document set '%s'...", input_file)

    if not utility.check_output_necessary(output_path + "/pos.json",
                                          overwrite):
        return

    with open(input_file, 'r') as infile:
        docs = json.load(infile)

    if count > 0:
        logging.info("Selecting document subset of size %i", count)
        docs = docs[0:count]

    # make the output path if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    sentences = []
    pos_sentences = []

    document_sentences = []
    sentence_documents = []

    logging.info("Sentencifying documents...")
    sentence_index_start = 0
    doc_index = 0
    for doc in tqdm(docs):

        # tokenize the document into sentences
        local_sentences = nltk.sent_tokenize(doc['text'])
        sentence_count = len(local_sentences)

        # add the associated sentence id's to the document sentences
        document_sentences.append(
            list(range(sentence_index_start,
                       sentence_index_start + sentence_count)))
        sentence_index_start += sentence_count

        # add the associated document id to the sentence_documents list
        sentence_documents.extend([doc_index] * sentence_count)
        doc_index += 1

        # add the tokenized sentences
        sentences.extend(local_sentences)

    logging.info("Tokenizing sentences...")
    for sentence in tqdm(sentences):
        # pos tagger
        words = nltk.word_tokenize(sentence)
        #words = [word.lower() for word in words if word.isalpha() and word != "s"]
        words = [word for word in words if word.isalpha() and word != "s"]
        tagged = nltk.pos_tag(words)
        pos_sentences.append(tagged)

    #return pos_sentences, sentences, document_sentences, sentence_documents

    pos_path = output_path + "/pos.json"
    doc_sent_path = output_path + "/doc_sent.json"
    sent_doc_path = output_path + "/sent_doc.json"
    logging.info("Saving tokenization information...")
    with open(pos_path, 'w') as file_out:
        json.dump(pos_sentences, file_out)
    with open(doc_sent_path, 'w') as file_out:
        json.dump(document_sentences, file_out)
    with open(sent_doc_path, 'w') as file_out:
        json.dump(sentence_documents, file_out)
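# Example usage (illustrative sketch only): assumes the same [{"text": "..."}]
# JSON document format as tfidf() above and that the NLTK punkt and tagger data
# are already downloaded; paths are hypothetical.
#
#   tokenize("data/docs.json", "data/tokenized", count=500, overwrite=True)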
def docify(input_folder,
           output_path,
           count=-1,
           content_column="content",
           source_column="publication",
           keywords=[],
           ignore_source=[],
           overwrite=False):
    """Create a file of documents from all csv files in a folder

    A count of -1 means output _all_ documents.
    Input_folder assumes no trailling /
    """

    logging.info("document data requested for '%s' dataset at '%s'...",
                 input_folder, output_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    # load the data
    logging.info("Loading article data...")
    article_table = None

    for filename in tqdm(os.listdir(input_folder)):
        if filename.endswith(".csv"):
            logging.debug("Loading '%s'...", filename)
            article_table_in = pd.read_csv(input_folder + "/" + filename)
            if article_table is None:
                article_table = article_table_in
            else:
                article_table = pd.concat([article_table, article_table_in])

    # remove any ignored sources
    for source in ignore_source:
        logging.info("Ignoring source %s...", source)
        article_table = article_table[article_table[source_column] != source]

    # target documents to keywords if supplied
    if len(keywords) > 0 and keywords[0] != "":
        tmp_table = pd.DataFrame(columns=article_table.columns)
        logging.info("Targeting document set to keywords %s...", str(keywords))
        for word in keywords:
            tmp_table = pd.concat([
                tmp_table,
                article_table[article_table[content_column].str.contains(word)]
            ])

        tmp_table = tmp_table.drop_duplicates()
        article_table = tmp_table.copy()

    # randomly sample a subset of documents if a count was given
    if count != -1:
        logging.info("Sampling a random subset of %i documents for output...", count)
        article_table = article_table.sample(count, random_state=42)

    # TODO: include source and content in output
    # TODO: randomize if count is less than 0
    # TODO: need to save more input parameters in log

    # collect the text and source of each article into document records
    logging.info("Grabbing articles...")
    documents = []
    for (idx, row) in tqdm(article_table.iterrows()):
        if count != -1 and len(documents) > count:
            break

        documents.append({
            "text": row.loc[content_column],
            "source": row.loc[source_column]
        })

    # this is literally pointless...the subset has already been taken at this point
    #if count != -1:
    #logging.info("Shuffling %i subset of documents for output...", count)
    #random.shuffle(documents)

    # write out the file
    logging.info("Saving document data to '%s'", output_path)
    with open(output_path, 'w') as outfile:
        json.dump(documents, outfile)

    logging.info("document data saved to '%s'", output_path)