def tfidf(input_path, output_path, overwrite=False):
    """Compute a TF-IDF matrix for a JSON document set and save it as JSON."""
    logging.info("TF-IDF requested on document set '%s'...", input_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    logging.info("Loading documents...")
    with open(input_path, 'r') as infile:
        docs = json.load(infile)

    corpus = []
    for document in tqdm(docs):
        corpus.append(document["text"])

    logging.info("Running TF-IDF...")
    vectorizer = TfidfVectorizer(max_features=5000)
    vectorizer.fit(corpus)
    tfidf_matrix = vectorizer.transform(corpus)

    logging.info("Saving TF-IDF matrix...")
    with open(output_path, 'w') as outfile:
        json.dump(tfidf_matrix.todense().tolist(), outfile)

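# NOTE: utility.check_output_necessary is defined elsewhere in this repo and is
# used by every function below. Its assumed contract (a hypothetical sketch, not
# the actual implementation) is to return False when the output already exists
# and overwrite was not requested, so callers can skip regenerating it:
#
#     def check_output_necessary(output_path, overwrite):
#         if os.path.exists(output_path) and not overwrite:
#             logging.info("Output '%s' already exists, skipping", output_path)
#             return False
#         return True

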
def prune_articles(input_file, output_file, source_threshold=20, word_threshold=500, overwrite=False):
    logging.info(
        "Pruned article set requested for webhose dataset at '%s', minimum source count of "
        "%i and minimum word count of %i",
        output_file,
        source_threshold,
        word_threshold,
    )

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_file, overwrite):
        return

    # load the data
    article_table = pd.read_csv(input_file)

    # prune by word count
    article_table = article_table[article_table.apply(
        lambda x: x['text'].count(' ') > word_threshold, axis=1)]

    # prune by source article count
    source_counts = article_table.site.value_counts()
    acceptable_sources = list(source_counts[source_counts > source_threshold].index)
    article_table = article_table[article_table.site.isin(acceptable_sources)]

    logging.info("Pruned to %i articles", article_table.shape[0])

    # write it out
    article_table.to_csv(output_file)

def preprocess(input_folder, output_path, count=-1, overwrite=False):
    """Run the preprocess process on all documents in dataset."""
    logging.info("Preprocessing requested for kaggle1 dataset at '%s'", output_path)

    # check if output file already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    # ensure nltk datasets are present
    logging.debug("Ensuring nltk sets...")
    nltk.download("stopwords")
    nltk.download("wordnet")

    # load the data
    logging.info("Loading article data...")
    logging.debug("Loading articles1.csv...")
    article_table1 = pd.read_csv(input_folder + "/articles1.csv")
    logging.debug("Loading articles2.csv...")
    article_table2 = pd.read_csv(input_folder + "/articles2.csv")
    logging.debug("Loading articles3.csv...")
    article_table3 = pd.read_csv(input_folder + "/articles3.csv")
    article_table = pd.concat([article_table1, article_table2, article_table3])

    logging.info("Parsing...")
    bar = IncrementalBar("Parsing", max=len(article_table.index))

    counter = 0
    with codecs.open(output_path, 'w', 'utf-8') as out:
        for article in article_table.content:
            if count >= 0 and counter == count:
                break
            counter += 1

            # get the sentences from this article
            for sentence in article.split("."):
                tokens = parse_sentence(sentence)
                if len(tokens) > 0:
                    out.write(' '.join(tokens) + '\n')

            out.write(" \n")
            bar.next()
    bar.finish()

    logging.info("Preprocessing completed, output at '%s'", output_path)

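# NOTE: parse_sentence is defined elsewhere in this module. Based on the nltk
# downloads above (stopwords, wordnet), a plausible sketch is lowercasing,
# keeping alphabetic tokens, dropping stopwords, and lemmatizing; this is an
# assumption, not the actual implementation:
#
#     lemmatizer = nltk.stem.WordNetLemmatizer()
#     stop_words = set(nltk.corpus.stopwords.words("english"))
#
#     def parse_sentence(sentence):
#         tokens = sentence.lower().split()
#         return [lemmatizer.lemmatize(t) for t in tokens
#                 if t.isalpha() and t not in stop_words]

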
def sentencify(input_folder, output_path, count=-1, overwrite=False):
    """Create a file of sentences from the 3 csv files.

    A count of -1 means output _all_ sentences.
    input_folder assumes no trailing /
    """
    logging.info("Sentence data requested for kaggle1 dataset at '%s'...", output_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    # load the data
    logging.info("Loading article data...")
    logging.debug("Loading articles1.csv...")
    article_table1 = pd.read_csv(input_folder + "/articles1.csv")
    logging.debug("Loading articles2.csv...")
    article_table2 = pd.read_csv(input_folder + "/articles2.csv")
    logging.debug("Loading articles3.csv...")
    article_table3 = pd.read_csv(input_folder + "/articles3.csv")
    article_table = pd.concat([article_table1, article_table2, article_table3])

    # split every sentence from every article on '.'
    logging.info("Splitting articles...")
    sentences = []
    for article in article_table.content:
        if count != -1 and len(sentences) > count:
            break
        sentences.extend(article.split("."))

    # chop down to size as needed
    if count != -1 and len(sentences) > count:
        sentences = sentences[:count]

    # write out the file
    logging.info("Saving sentence data to '%s'", output_path)
    with open(output_path, 'w') as file_out:
        for sentence in sentences:
            file_out.write("{0}\n".format(sentence))

    logging.info("Sentence data saved to '%s'", output_path)

def combine(input_file_1, input_file_2, output_file, overwrite=False):
    logging.info("Combined feature set requested from %s and %s", input_file_1, input_file_2)

    if not utility.check_output_necessary(output_file, overwrite):
        return

    df1 = pd.read_json(input_file_1)
    df2 = pd.read_json(input_file_2)

    combined_df = pd.concat([df1, df2], axis=1)

    logging.info("Saving combined feature set to '%s'", output_file)
    rows = combined_df.values.tolist()
    with open(output_file, 'w') as outfile:
        json.dump(rows, outfile)
    logging.info("Feature set saved")

def run(
        input_path,
        output_path,
        embed_dim=200,
        window=5,
        min_count=10,
        workers=4,
        epochs=2,
        overwrite=False,
):
    """The primary function for generating the word2vec model."""
    logging.info("Word2vec model requested for input '%s', output '%s'", input_path, output_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    logging.info("Running word2vec on '%s', outputting to '%s'...", input_path, output_path)

    # convert input
    logging.debug("Converting input into sentences...")
    sentences = Sentences(input_path)

    # run the model creation function
    logging.debug("Generating model...")
    logging.info("Model params: %i, %i, %i, %i, %i", embed_dim, window, min_count, workers, epochs)
    model = gensim.models.Word2Vec(
        sentences,
        size=embed_dim,  # gensim 3.x parameter name (vector_size in gensim 4+)
        window=window,
        min_count=min_count,
        workers=workers,
        sg=1,
        iter=epochs,  # gensim 3.x parameter name (epochs in gensim 4+)
    )

    model.save(output_path)
    logging.info("Word2Vec model saved to '%s'", output_path)

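# NOTE: Sentences is defined elsewhere in this module. It is presumably the
# standard gensim streaming-corpus idiom: iterate over the preprocessed file
# line by line and yield token lists, so the whole corpus never has to fit in
# memory at once. A hypothetical sketch (not the actual class):
#
#     class Sentences:
#         def __init__(self, path):
#             self.path = path
#
#         def __iter__(self):
#             with open(self.path, 'r') as infile:
#                 for line in infile:
#                     yield line.split()

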
def tokenize(input_file, output_path, count=-1, overwrite=False):
    """Sentence- and POS-tokenize a JSON document set, saving the results to output_path."""
    logging.info("Tokenization requested on document set '%s'...", input_file)

    if not utility.check_output_necessary(output_path + "/pos.json", overwrite):
        return

    with open(input_file, 'r') as infile:
        docs = json.load(infile)

    if count > 0:
        logging.info("Selecting document subset of size %i", count)
        docs = docs[0:count]

    # make the output path if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    sentences = []
    pos_sentences = []
    document_sentences = []
    sentence_documents = []

    logging.info("Sentencifying documents...")
    sentence_index_start = 0
    doc_index = 0
    for doc in tqdm(docs):
        # tokenize
        local_sentences = nltk.sent_tokenize(doc['text'])
        sentence_count = len(local_sentences)

        # add the associated sentence id's to the document sentences
        document_sentences.append(
            list(range(sentence_index_start, sentence_index_start + sentence_count)))
        sentence_index_start += sentence_count

        # add the associated document id to the sentence_documents list
        sentence_documents.extend([doc_index] * sentence_count)
        doc_index += 1

        # add the tokenized sentences
        sentences.extend(local_sentences)

    logging.info("Tokenizing sentences...")
    for sentence in tqdm(sentences):
        # pos tagger
        words = nltk.word_tokenize(sentence)
        words = [word for word in words if word.isalpha() and word != "s"]
        tagged = nltk.pos_tag(words)
        pos_sentences.append(tagged)

    pos_path = output_path + "/pos.json"
    doc_sent_path = output_path + "/doc_sent.json"
    sent_doc_path = output_path + "/sent_doc.json"

    logging.info("Saving tokenization information...")
    with open(pos_path, 'w') as file_out:
        json.dump(pos_sentences, file_out)
    with open(doc_sent_path, 'w') as file_out:
        json.dump(document_sentences, file_out)
    with open(sent_doc_path, 'w') as file_out:
        json.dump(sentence_documents, file_out)

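# Sketch of how the three output files relate when loaded back (the paths here
# are examples only):
#
#     with open("tokens/pos.json") as f:
#         pos_sentences = json.load(f)        # pos_sentences[i] = [[word, tag], ...] for sentence i
#     with open("tokens/sent_doc.json") as f:
#         sentence_documents = json.load(f)   # sentence_documents[i] = index of sentence i's source document
#     with open("tokens/doc_sent.json") as f:
#         document_sentences = json.load(f)   # document_sentences[d] = list of sentence indices in document d

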
def docify(input_folder,
           output_path,
           count=-1,
           content_column="content",
           source_column="publication",
           keywords=[],
           ignore_source=[],
           overwrite=False):
    """Create a file of documents from all csv files in a folder.

    A count of -1 means output _all_ documents.
    input_folder assumes no trailing /
    """
    logging.info("Document data requested for '%s' dataset at '%s'...", input_folder, output_path)

    # check to see if the output path already exists
    if not utility.check_output_necessary(output_path, overwrite):
        return

    # load the data
    logging.info("Loading article data...")
    article_table = None
    for filename in tqdm(os.listdir(input_folder)):
        if filename.endswith(".csv"):
            logging.debug("Loading '%s'...", filename)
            article_table_in = pd.read_csv(input_folder + "/" + filename)
            if article_table is None:
                article_table = article_table_in
            else:
                article_table = pd.concat([article_table, article_table_in])

    # remove any ignored sources
    for source in ignore_source:
        logging.info("Ignoring source %s...", source)
        article_table = article_table[article_table[source_column] != source]

    # target documents to keywords if supplied
    if len(keywords) > 0 and keywords[0] != "":
        tmp_table = pd.DataFrame(columns=article_table.columns)
        logging.info("Targeting document set to keywords %s...", str(keywords))
        for word in keywords:
            tmp_table = pd.concat([
                tmp_table,
                article_table[article_table[content_column].str.contains(word)]
            ])
        tmp_table = tmp_table.drop_duplicates()
        article_table = tmp_table.copy()

    # randomly shuffle and subset if a count was requested
    if count != -1:
        logging.info("Shuffling %i subset of documents for output...", count)
        article_table = article_table.sample(count, random_state=42)

    # TODO: randomize when count is -1 (currently only the sampled subset is shuffled)
    # TODO: need to save more input parameters in log

    # grab the content and source of every remaining article
    logging.info("Grabbing articles...")
    documents = []
    for (idx, row) in tqdm(article_table.iterrows()):
        if count != -1 and len(documents) > count:
            break
        documents.append({
            "text": row.loc[content_column],
            "source": row.loc[source_column]
        })

    # write out the file
    logging.info("Saving document data to '%s'", output_path)
    with open(output_path, 'w') as outfile:
        json.dump(documents, outfile)

    logging.info("Document data saved to '%s'", output_path)

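# Hypothetical invocation sketch (the paths, keyword, and source name below are
# placeholders, not values used by the actual pipeline):
#
#     docify(
#         "data/kaggle1",
#         "data/docs.json",
#         count=1000,
#         keywords=["climate"],
#         ignore_source=["SomeUnwantedSource"],
#         overwrite=True,
#     )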