def get_topic_set(file_path):
    """
    Opens one of the topic set resource files and returns a set of topics.

    - Input:  - file_path: The path pointing to the topic set resource file.

    - Output: - topic_set: A python set of strings.
    """
    topic_set = set()
    file_row_gen = get_file_row_generator(file_path, ",")  # Each row holds a single topic, so the separator is irrelevant.
    for file_row in file_row_gen:
        topic_set.add(file_row[0])

    return topic_set
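# Note: the functions collected here also rely on a few package-internal helpers
# (get_file_row_generator, get_package_path, separate_camel_case) that are defined
# elsewhere in the package and are not repeated in these examples. Based on the
# names used below, a plausible (assumed) module-level import block would be:
import os
import re
import string
from collections import defaultdict

import numpy as np
import scipy.sparse as spsp

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer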
def read_matlab_features(array_paths, number_of_nodes, dimensionality):
    """
    Returns a sparse feature matrix as calculated by a Matlab routine.
    """
    # Read the data array
    file_row_gen = get_file_row_generator(array_paths[0], "\t")
    data = list()
    append_data = data.append
    for file_row in file_row_gen:
        append_data(float(file_row[0]))

    # Read the row array
    file_row_gen = get_file_row_generator(array_paths[1], "\t")
    row = list()
    append_row = row.append
    for file_row in file_row_gen:
        append_row(int(float(file_row[0])))

    # Read the column array
    file_row_gen = get_file_row_generator(array_paths[2], "\t")
    col = list()
    append_col = col.append
    for file_row in file_row_gen:
        append_col(int(float(file_row[0])))

    data = np.array(data).astype(np.float64)
    row = np.array(row).astype(np.int64) - 1  # Due to Matlab numbering
    col = np.array(col).astype(np.int64) - 1  # Due to Matlab numbering

    # Sanity check: after the shift to zero-based indexing, the index extrema should fit within the matrix shape.
    print(np.max(row), np.min(row))
    print(np.max(col), np.min(col))

    features = spsp.coo_matrix((data, (row, col)),
                               shape=(number_of_nodes, dimensionality))

    return features
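# A minimal usage sketch for read_matlab_features, assuming the Matlab routine
# exported three single-column files holding the non-zero values, their 1-based
# row indices and their 1-based column indices; the file names are hypothetical.
def example_read_matlab_features():
    array_paths = ["features_data.tsv", "features_row.tsv", "features_col.tsv"]
    features = read_matlab_features(array_paths, number_of_nodes=10000, dimensionality=200)
    return features.tocsr()  # Convert COO to CSR for efficient row slicing.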
def read_deepwalk_features(deepwalk_folder, number_of_nodes=None):
    """
    Reads a DeepWalk embedding text file and returns a dense feature matrix.

    - Input:  - deepwalk_folder: The folder containing the "deepwalk.txt" embedding file.
              - number_of_nodes: Optionally overrides the node count declared in the file header.

    - Output: - features: A numpy array of shape (number_of_nodes, dimensionality).
    """
    file_row_gen = get_file_row_generator(deepwalk_folder + "/deepwalk.txt", " ")

    first_row = next(file_row_gen)

    if number_of_nodes is not None:
        features = np.zeros((number_of_nodes, int(first_row[1])), dtype=np.float64)
    else:
        features = np.zeros((int(first_row[0]), int(first_row[1])), dtype=np.float64)

    for file_row in file_row_gen:
        node = int(file_row[0]) - 1
        features[node, :] = np.array([np.float64(coordinate) for coordinate in file_row[1:]])

    return features
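# A minimal usage sketch for read_deepwalk_features, assuming the usual
# DeepWalk/word2vec text output: a header row with the node count and the
# dimensionality, followed by one "node_id coordinate ..." row per node with
# 1-based node ids. The folder path is hypothetical.
def example_read_deepwalk_features():
    features = read_deepwalk_features("/tmp/deepwalk_output", number_of_nodes=10000)
    return features.shape  # (10000, dimensionality taken from the file header)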
def get_topic_keyword_dictionary():
    """
    Opens the topic-keyword map resource file and returns the corresponding python dictionary.

    - Output: - topic_keyword_dictionary: A python dictionary mapping each topic string to a set of keyword strings.
    """
    topic_keyword_dictionary = dict()
    file_row_gen = get_file_row_generator(get_package_path() + "/twitter/res/topics/topic_keyword_mapping.txt",
                                          ",",
                                          encoding="utf-8")
    for file_row in file_row_gen:
        topic_keyword_dictionary[file_row[0]] = set(file_row[1:])

    return topic_keyword_dictionary
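# A small sketch of the comma-separated resource format that
# get_topic_keyword_dictionary expects (first field is the topic, the remaining
# fields are its keywords); the topics and keywords shown are hypothetical:
#
#   politics,election,parliament,vote
#   sports,football,league,match
#
def example_count_topic_keywords():
    topic_keyword_dictionary = get_topic_keyword_dictionary()
    return {topic: len(keywords) for topic, keywords in topic_keyword_dictionary.items()}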
def read_dense_separated_value_file(file_path, number_of_nodes, separator=","):
    """
    Reads a dense separated-value feature file into a numpy array.

    - Input:  - file_path: The path pointing to the separated-value file.
              - number_of_nodes: The number of rows expected in the file.
              - separator: The delimiter between values on each row.

    - Output: - features: A numpy array of shape (number_of_nodes, number_of_dimensions).
    """
    file_row_gen = get_file_row_generator(file_path=file_path, separator=separator)

    first_file_row = next(file_row_gen)
    number_of_dimensions = len(first_file_row)

    features = np.empty((number_of_nodes, number_of_dimensions), dtype=np.float64)

    file_row_counter = 0
    features[file_row_counter, :] = np.array(first_file_row, dtype=np.float64)

    for file_row in file_row_gen:
        file_row_counter += 1
        features[file_row_counter, :] = np.array(file_row, dtype=np.float64)

    return features
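# A minimal usage sketch for read_dense_separated_value_file, assuming a plain
# CSV file with one row of numeric features per node; the path is hypothetical.
def example_read_dense_features():
    features = read_dense_separated_value_file("/tmp/features.csv", number_of_nodes=10000, separator=",")
    return features.mean(axis=0)  # e.g. the per-dimension feature means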
def get_stopset():
    """
    Returns the set of English stopwords, extended with the stopword files bundled under /text/res/stopwords/.
    """
    stopset = set(stopwords.words('english'))  # Make set for faster access

    more_stopword_files_list = os.listdir(get_package_path() +
                                          "/text/res/stopwords/")
    more_stopword_files_list = (get_package_path() + "/text/res/stopwords/" +
                                file_name
                                for file_name in more_stopword_files_list)

    # Read more stopwords from files
    extended_stopset = list()
    append_stopwords = extended_stopset.append
    for stop_word_file in more_stopword_files_list:
        file_row_gen = get_file_row_generator(stop_word_file,
                                              ",",
                                              encoding="utf-8")
        for row in file_row_gen:
            append_stopwords(row[0])
    stopset.update(extended_stopset)  # Update once, after all stopword files have been read.
    return stopset
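# A tiny usage sketch for get_stopset: filtering an arbitrary token list against
# the combined stopword set.
def example_filter_stopwords():
    stopset = get_stopset()
    tokens = ["the", "parliament", "and", "election"]
    return [token for token in tokens if token not in stopset]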
def clean_document(document, lemmatizing="wordnet"):
    """
    Extracts a clean bag-of-words from a document.

    Inputs: - document: A string containing some text.
            - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output: - lemma_list: A python list of lemmas or stems.
            - lemma_to_keywordbag: A python dictionary that maps stems/lemmas to original topic keywords.
    """
    ####################################################################################################################
    # Tokenizing text
    ####################################################################################################################
    try:
        tokenized_document = word_tokenize(document)
    except LookupError:
        print("Warning: Could not tokenize document. If these warnings are commonplace, there is a problem with the nltk resources.")
        lemma_list = list()
        lemma_to_keywordbag = defaultdict(lambda: defaultdict(int))
        return lemma_list, lemma_to_keywordbag

    ####################################################################################################################
    # Separate ["camelCase"] into ["camel", "case"] and make every letter lower case
    ####################################################################################################################
    tokenized_document = [separate_camel_case(token).lower() for token in tokenized_document]

    ####################################################################################################################
    # Parts of speech tagger
    ####################################################################################################################
    tokenized_document = nltk.pos_tag(tokenized_document)
    tokenized_document = [token[0] for token in tokenized_document
                          if token[1] in ("JJ", "NN", "NNS", "NNP")]

    ####################################################################################################################
    # Removing digits, punctuation and whitespace
    ####################################################################################################################
    # See documentation here: http://docs.python.org/2/library/string.html
    regex = re.compile('[%s]' % re.escape(string.digits + string.punctuation + string.whitespace))

    tokenized_document_no_punctuation = list()
    append_token = tokenized_document_no_punctuation.append
    for token in tokenized_document:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            append_token(new_token)

    ####################################################################################################################
    # Removing stopwords
    ####################################################################################################################
    stopset = set(stopwords.words('english'))  # Make set for faster access

    more_stopword_files_list = os.listdir(get_package_path() + "/text/res/stopwords/")
    more_stopword_files_list = (get_package_path() + "/text/res/stopwords/" + file_name for file_name in more_stopword_files_list)

    # Read more stopwords from files
    extended_stopset = list()
    append_stopwords = extended_stopset.append
    for stop_word_file in more_stopword_files_list:
        file_row_gen = get_file_row_generator(stop_word_file, ",", encoding="utf-8")
        for row in file_row_gen:
            append_stopwords(row[0])
    stopset.update(extended_stopset)  # Update once, after all stopword files have been read.

    tokenized_document_no_stopwords = list()
    append_word = tokenized_document_no_stopwords.append
    for word in tokenized_document_no_punctuation:
        if word not in stopset:
            append_word(word)

    ####################################################################################################################
    # Remove words that have been created by automated list tools.
    ####################################################################################################################
    # # TODO: This should be done either for list keywords, or with a regex test(0-9), descr(0-9).
    # tokenized_document_no_stopwords_no_autowords = list()
    # append_word = tokenized_document_no_stopwords_no_autowords.append
    # for word in tokenized_document_no_stopwords:
    #     if not word.startswith(prefix=autoword_tuple):
    #         append_word(word)

    ####################################################################################################################
    # Stemming and Lemmatizing
    ####################################################################################################################
    lemma_to_keywordbag = defaultdict(lambda: defaultdict(int))

    # Instantiate the stemmers/lemmatizer once, outside the word loop.
    porter = PorterStemmer()
    snowball = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()

    final_doc = list()
    append_lemma = final_doc.append
    for word in tokenized_document_no_stopwords:
        if lemmatizing == "porter":
            stem = porter.stem(word)
            append_lemma(stem)
            lemma_to_keywordbag[stem][word] += 1
        elif lemmatizing == "snowball":
            stem = snowball.stem(word)
            append_lemma(stem)
            lemma_to_keywordbag[stem][word] += 1
        elif lemmatizing == "wordnet":
            lemma = wordnet.lemmatize(word)
            append_lemma(lemma)
            lemma_to_keywordbag[lemma][word] += 1
        else:
            raise RuntimeError("Invalid lemmatizer argument: " + str(lemmatizing))

    ####################################################################################################################
    # One more stopword removal
    ####################################################################################################################
    lemma_list = list()
    append_word = lemma_list.append
    for word in final_doc:
        if word not in stopset:
            append_word(word)

    return lemma_list, lemma_to_keywordbag
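# A short usage sketch for clean_document. The sentence is arbitrary, and the
# default "wordnet" path requires the relevant nltk resources (punkt, stopwords,
# wordnet and the POS tagger models) to be available locally.
def example_clean_document():
    document = "The parliaments voted on the new election laws."
    lemma_list, lemma_to_keywordbag = clean_document(document, lemmatizing="wordnet")
    # lemma_list is the cleaned bag-of-words; lemma_to_keywordbag maps each lemma
    # back to counts of the original surface forms it was derived from.
    return lemma_list, {lemma: dict(bag) for lemma, bag in lemma_to_keywordbag.items()}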