import os
import re
import string
from collections import defaultdict

import numpy as np
import scipy.sparse as spsp

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# NOTE: get_file_row_generator, get_package_path and separate_camel_case are project-local helpers;
#       they are assumed to be imported from elsewhere in this package.


def get_topic_set(file_path):
    """
    Opens one of the topic set resource files and returns a set of topics.

    Input:  - file_path: The path pointing to the topic set resource file.

    Output: - topic_set: A python set of strings.
    """
    topic_set = set()

    file_row_gen = get_file_row_generator(file_path, ",")  # The separator here is irrelevant.
    for file_row in file_row_gen:
        topic_set.add(file_row[0])

    return topic_set
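# Hedged usage sketch for get_topic_set: the resource path below is hypothetical and only
# illustrates the expected call; any file with one topic per row (topic in the first column) works.
def _example_get_topic_set():
    topic_file = get_package_path() + "/twitter/res/topics/story_set.txt"  # Hypothetical path.
    topics = get_topic_set(topic_file)
    print(len(topics), "topics loaded.")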
def read_matlab_features(array_paths, number_of_nodes, dimensionality):
    """
    Returns a sparse feature matrix as calculated by a Matlab routine.
    """
    # Read the data array
    file_row_gen = get_file_row_generator(array_paths[0], "\t")
    data = list()
    append_data = data.append
    for file_row in file_row_gen:
        append_data(float(file_row[0]))

    # Read the row array
    file_row_gen = get_file_row_generator(array_paths[1], "\t")
    row = list()
    append_row = row.append
    for file_row in file_row_gen:
        append_row(int(float(file_row[0])))

    # Read the column array
    file_row_gen = get_file_row_generator(array_paths[2], "\t")
    col = list()
    append_col = col.append
    for file_row in file_row_gen:
        append_col(int(float(file_row[0])))

    data = np.array(data).astype(np.float64)
    row = np.array(row).astype(np.int64) - 1  # Convert from Matlab's 1-based indexing.
    col = np.array(col).astype(np.int64) - 1  # Convert from Matlab's 1-based indexing.

    # Sanity check on the index ranges (debug output).
    print(np.max(row), np.min(row))
    print(np.max(col), np.min(col))

    features = spsp.coo_matrix((data, (row, col)), shape=(number_of_nodes, dimensionality))

    return features
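# Hedged usage sketch for read_matlab_features: the three tab-separated file names below are
# hypothetical; they are assumed to hold the value, row-index and column-index arrays of a COO
# matrix written out by Matlab (1-based indices, hence the -1 shift above).
def _example_read_matlab_features():
    array_paths = ("features_data.tsv", "features_row.tsv", "features_col.tsv")  # Hypothetical paths.
    features = read_matlab_features(array_paths, number_of_nodes=1000, dimensionality=64)
    print(features.shape)  # Expected: (1000, 64)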
def read_deepwalk_features(deepwalk_folder, number_of_nodes=None):
    """
    Reads a DeepWalk embedding file and returns a dense (number_of_nodes x dimensionality) numpy array.
    """
    file_row_gen = get_file_row_generator(deepwalk_folder + "/deepwalk.txt", " ")

    first_row = next(file_row_gen)
    if number_of_nodes is not None:
        features = np.zeros((number_of_nodes, int(first_row[1])), dtype=np.float64)
    else:
        features = np.zeros((int(first_row[0]), int(first_row[1])), dtype=np.float64)

    for file_row in file_row_gen:
        node = int(file_row[0]) - 1
        features[node, :] = np.array([np.float64(coordinate) for coordinate in file_row[1:]])

    return features
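# Hedged usage sketch for read_deepwalk_features: assumes the usual DeepWalk output layout, i.e. a
# header row "<number_of_nodes> <dimensionality>" followed by one "<node_id> <coordinates...>" row
# per node with 1-based node ids. The folder path is hypothetical.
def _example_read_deepwalk_features():
    features = read_deepwalk_features("/tmp/deepwalk_output", number_of_nodes=1000)  # Hypothetical folder.
    print(features.shape)  # Expected: (1000, dimensionality taken from the header row).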
def get_topic_keyword_dictionary():
    """
    Opens the topic-keyword map resource file and returns the corresponding python dictionary.

    Output: - topic_keyword_dictionary: A topic to keyword set python dictionary.
    """
    topic_keyword_dictionary = dict()

    file_row_gen = get_file_row_generator(get_package_path() + "/twitter/res/topics/topic_keyword_mapping" + ".txt",
                                          ",",
                                          "utf-8")
    for file_row in file_row_gen:
        topic_keyword_dictionary[file_row[0]] = set(file_row[1:])

    return topic_keyword_dictionary
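# Hedged usage sketch for get_topic_keyword_dictionary: the key used in the lookup is illustrative
# only; the returned dictionary maps each topic string to a set of keyword strings.
def _example_get_topic_keyword_dictionary():
    topic_to_keywords = get_topic_keyword_dictionary()
    print(topic_to_keywords.get("politics", set()))  # "politics" is a hypothetical key.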
def read_dense_separated_value_file(file_path, number_of_nodes, separator=","):
    """
    Reads a dense, separator-delimited feature file into a (number_of_nodes x dimensions) numpy array.
    """
    file_row_gen = get_file_row_generator(file_path=file_path, separator=separator)

    first_file_row = next(file_row_gen)
    number_of_dimensions = len(first_file_row)

    features = np.empty((number_of_nodes, number_of_dimensions), dtype=np.float64)

    file_row_counter = 0
    features[file_row_counter, :] = np.array(first_file_row)
    for file_row in file_row_gen:
        file_row_counter += 1
        features[file_row_counter, :] = np.array(file_row)

    return features
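# Hedged usage sketch for read_dense_separated_value_file: the file name is hypothetical; the file
# is assumed to hold one node per row, one numeric feature per separated column, and exactly
# number_of_nodes rows.
def _example_read_dense_separated_value_file():
    features = read_dense_separated_value_file("node_features.csv", number_of_nodes=1000, separator=",")  # Hypothetical path.
    print(features.shape)  # Expected: (1000, number of columns in the file).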
def get_stopset():
    """
    Returns a stopword set: NLTK's English stopwords extended with the package's stopword resource files.
    """
    stopset = set(stopwords.words('english'))  # Make set for faster access

    more_stopword_files_list = os.listdir(get_package_path() + "/text/res/stopwords/")
    more_stopword_files_list = (get_package_path() + "/text/res/stopwords/" + file_name
                                for file_name in more_stopword_files_list)

    # Read more stopwords from files
    extended_stopset = list()
    append_stopwords = extended_stopset.append
    for stop_word_file in more_stopword_files_list:
        file_row_gen = get_file_row_generator(stop_word_file, ",", encoding="utf-8")
        for row in file_row_gen:
            append_stopwords(row[0])
    stopset.update(extended_stopset)

    return stopset
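# Hedged usage sketch for get_stopset: simply inspects the returned container; the extra stopword
# files are read from the package's /text/res/stopwords/ folder, as implemented above.
def _example_get_stopset():
    stopset = get_stopset()
    print(len(stopset), "stopwords;", "the" in stopset)  # "the" is in NLTK's English stopword list.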
def clean_document(document, lemmatizing="wordnet"):
    """
    Extracts a clean bag-of-words from a document.

    Inputs:  - document: A string containing some text.
             - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Outputs: - lemma_list: A python list of lemmas or stems.
             - lemma_to_keywordbag: A python dictionary that maps stems/lemmas to original topic keywords.
    """
    ####################################################################################################################
    # Tokenizing text
    ####################################################################################################################
    try:
        tokenized_document = word_tokenize(document)
    except LookupError:
        print("Warning: Could not tokenize document. If these warnings are commonplace, "
              "there is a problem with the nltk resources.")
        lemma_list = list()
        lemma_to_keywordbag = defaultdict(lambda: defaultdict(int))
        return lemma_list, lemma_to_keywordbag

    ####################################################################################################################
    # Separate ["camelCase"] into ["camel", "case"] and make every letter lower case
    ####################################################################################################################
    tokenized_document = [separate_camel_case(token).lower() for token in tokenized_document]

    ####################################################################################################################
    # Parts of speech tagger
    ####################################################################################################################
    tokenized_document = nltk.pos_tag(tokenized_document)
    tokenized_document = [token[0] for token in tokenized_document
                          if (token[1] == "JJ" or token[1] == "NN" or token[1] == "NNS" or token[1] == "NNP")]

    ####################################################################################################################
    # Removing digits, punctuation and whitespace
    ####################################################################################################################
    # See documentation here: http://docs.python.org/2/library/string.html
    regex = re.compile('[%s]' % re.escape(string.digits + string.punctuation + string.whitespace))

    tokenized_document_no_punctuation = list()
    append_token = tokenized_document_no_punctuation.append
    for token in tokenized_document:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            append_token(new_token)

    ####################################################################################################################
    # Removing stopwords
    ####################################################################################################################
    stopset = set(stopwords.words('english'))  # Make set for faster access

    more_stopword_files_list = os.listdir(get_package_path() + "/text/res/stopwords/")
    more_stopword_files_list = (get_package_path() + "/text/res/stopwords/" + file_name
                                for file_name in more_stopword_files_list)

    # Read more stopwords from files
    extended_stopset = list()
    append_stopwords = extended_stopset.append
    for stop_word_file in more_stopword_files_list:
        file_row_gen = get_file_row_generator(stop_word_file, ",", encoding="utf-8")
        for row in file_row_gen:
            append_stopwords(row[0])
    stopset.update(extended_stopset)

    tokenized_document_no_stopwords = list()
    append_word = tokenized_document_no_stopwords.append
    for word in tokenized_document_no_punctuation:
        if word not in stopset:
            append_word(word)
    ####################################################################################################################
    # Remove words that have been created by automated list tools.
    ####################################################################################################################
    # # TODO: This should be done either for list keywords, or with a regex test(0-9), descr(0-9).
    # tokenized_document_no_stopwords_no_autowords = list()
    # append_word = tokenized_document_no_stopwords_no_autowords.append
    # for word in tokenized_document_no_stopwords:
    #     if not word.startswith(autoword_tuple):
    #         append_word(word)

    ####################################################################################################################
    # Stemming and Lemmatizing
    ####################################################################################################################
    lemma_to_keywordbag = defaultdict(lambda: defaultdict(int))

    final_doc = list()
    append_lemma = final_doc.append
    for word in tokenized_document_no_stopwords:
        if lemmatizing == "porter":
            porter = PorterStemmer()
            stem = porter.stem(word)
            append_lemma(stem)
            lemma_to_keywordbag[stem][word] += 1
        elif lemmatizing == "snowball":
            snowball = SnowballStemmer('english')
            stem = snowball.stem(word)
            append_lemma(stem)
            lemma_to_keywordbag[stem][word] += 1
        elif lemmatizing == "wordnet":
            wordnet = WordNetLemmatizer()
            lemma = wordnet.lemmatize(word)
            append_lemma(lemma)
            lemma_to_keywordbag[lemma][word] += 1
        else:
            raise RuntimeError("Invalid lemmatizer argument: " + lemmatizing)

    ####################################################################################################################
    # One more stopword removal
    ####################################################################################################################
    lemma_list = list()
    append_word = lemma_list.append
    for word in final_doc:
        if word not in stopset:
            append_word(word)

    return lemma_list, lemma_to_keywordbag
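# Hedged usage sketch for clean_document: the input sentence is illustrative only. With the default
# "wordnet" lemmatizer, lemma_list holds the lemmatized nouns/adjectives that survive cleaning, and
# lemma_to_keywordbag counts which original tokens produced each lemma.
def _example_clean_document():
    text = "Earthquake experts are monitoring the aftershocks near the coastal cities."
    lemma_list, lemma_to_keywordbag = clean_document(text, lemmatizing="wordnet")
    print(lemma_list)
    print(dict(lemma_to_keywordbag))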