Example #1
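# Imports and stand-ins assumed by this excerpt (they are not shown in the
# original snippet): `stop_words` is an extra list of custom stop words and
# `multi_word_tokenizer` appears to be an nltk MWETokenizer, judging from
# the add_mwe() call further down.
import csv
import os

from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer

stop_words = []
multi_word_tokenizer = MWETokenizer()
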
white_words = []
with open("./input/whitelist.txt", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        white_words.append(line)

# Read our list of multi words
multi_words = []
with open("./input/multiwords.txt", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        multi_words.append(line)

# Add multiwords to tokenizer
for mw in multi_words:
    multi_word_tokenizer.add_mwe(tuple(mw.strip().split(" ")))  # Needs a tuple
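
# A minimal sketch (not part of the original script) of what the MWE tokenizer
# does once expressions are registered: multi-word expressions are merged into
# single tokens joined by '_'. Uses the MWETokenizer imported above.
_demo = MWETokenizer()
_demo.add_mwe(('New', 'York'))
print(_demo.tokenize("I love New York".split()))  # ['I', 'love', 'New_York']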

# All stop words
all_stopwords = set(stopwords.words('german')) | set(stop_words)

# Check if directories exist, otherwise generate them
if not os.path.exists("./output"):
    os.makedirs("./output")
if not os.path.exists("./dbs"):
    os.makedirs("./dbs")

# Open the output file for writing
csvfile = open("./output/frequency.csv", "w", newline="", encoding="utf-8")
wr = csv.writer(csvfile, quoting=csv.QUOTE_NONE, delimiter=';')

# Open stop_word file for writing
Example #2
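# Imports assumed by this excerpt (the original file is Python 2 code).
# Config, PhrasesExtractor, WordsAnalysor and isPhrase are project-local
# helpers defined elsewhere in the original repository and are not
# reproduced here.
import json
import os
import sys
from collections import defaultdict

import arrow
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
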
class TextAnalysor:
    '''
    TextAnalysor

    A class for processing raw text, extracting useful features and
    providing a user-friendly data API.

    Essential public attributes:
    * dt_matrix: document-term (document-feature) matrix
    * labels:    labels for each of the documents
    '''
    INI_PATH = 'conf/text.ini'
    WORD_MIN_LEN = 2
    ANCHOR_MIN_SIM = 0.5
    PHRASE_MIN_SIM = 0.8

    def __init__(self):
        print >> sys.stderr, '[TEXT]\t%s\t*** Initializing Text Object ***' % arrow.now()
        # Read Configuration from ini file
        conf = Config(self.INI_PATH)
        phrases_extractor_path = conf.config_section_map('Model')['n_gram']
        word2vec_model_path = conf.config_section_map('Model')['word2vec']
        words_category_path = conf.config_section_map('Corpus')['key_words']

        # Variable initialization
        # - key words and their related words
        self.words_category = None
        with open(words_category_path, 'rb') as f:
            self.words_category = json.load(f)
        # - all of the related words in the words_category
        print >> sys.stderr, '[TEXT]\t%s\tLoading n-Gram model ...' % arrow.now()
        self.interested_phrases = list(
            set([
                item for sublist in self.words_category.values()  # Get sublist
                for item in sublist  # Merge sublist
                if isPhrase(item)  # Filter non phrases
            ]))
        # - word2vec model
        print >> sys.stderr, '[TEXT]\t%s\tLoading word2vec model ...' % arrow.now()
        self.word2vec_model = Word2Vec.load_word2vec_format(
            word2vec_model_path, binary=True)
        print >> sys.stderr, '[TEXT]\t'
        # - phrases extractor (n-gram kernel)
        self.phrases_extractor = PhrasesExtractor(
            phrases_extractor_path, interested_phrases=self.interested_phrases)
        # - MWE Tokenizer
        self.mwe = MWETokenizer()
        # Init words analysor
        self.words_analysor = WordsAnalysor()
        # Document-Term Vectors
        self.dt_matrix = []
        # Labels for documents
        self.labels = []

    def save_variables(self, file_path):
        '''
        SAVE VARIABLES

        Save the state of the text analysor in two files: a .npy file that
        stores the document-term matrix, and a text file that stores the
        labels.
        '''
        # Save the document-term matrix
        np.save(file_path, self.dt_matrix)
        # Save the labels information
        labels = [
            '#'.join(multiple_labels) + '\n' for multiple_labels in self.labels
        ]
        with open(file_path + '.txt', 'w') as f:
            try:
                f.writelines(labels)
            except:
                print >> sys.stderr, '[ERROR] Saving failed. Invalid file path: %s' % file_path

    def load_variables(self, file_path):
        '''
        LOAD VARIABLES

        Load two files (.txt for the labels and .npy for the document-term
        matrix) from the local file system to initialize a text analysor
        instance.
        '''
        if not os.path.exists(file_path + '.txt') or not os.path.exists(file_path + '.npy'):
            print >> sys.stderr, '[WARN] Loading failed. Invalid file path: %s' % file_path
            return
        # Load the document-term matrix
        self.dt_matrix = np.load(file_path + '.npy').tolist()
        # Load the labels information
        with open(file_path + '.txt', 'r') as f:
            try:
                labels = f.readlines()
                self.labels = [
                    list(set(label.strip('\n').split('#'))) for label in labels
                ]
            except:
                print >> sys.stderr, '[ERROR] Loading failed. Unknown error'

    ####################################
    # Global Analysis
    ####################################

    def fuzzy_LSA(self, n_components_for_svd=2):
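        '''
        Apply a TF-IDF transformation to the fuzzy document-term matrix
        (self.dt_matrix, built from k-nearest-token matching) and reduce it
        with truncated SVD (LSA). Returns the SVD feature matrix, the same
        features grouped by label, and the TF-IDF matrix.
        '''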
        print >> sys.stderr, '[TEXT]\t%s\tFuzzy LSA ...' % arrow.now()
        # Tf-idf Transformation
        tfidf = TfidfTransformer()
        tfidf_matrix = tfidf.fit_transform(self.dt_matrix).toarray()
        # SVD
        # n_components is recommended to be 100 by Sklearn Documentation for LSA
        # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
        svd = TruncatedSVD(n_components=n_components_for_svd)
        svd_matrix = svd.fit_transform(tfidf_matrix)
        # print >> sys.stderr, tfidf_matrix
        # print >> sys.stderr, svd_matrix
        feature_matrix = svd_matrix.tolist()
        return feature_matrix, \
               self._sort_by_labels(feature_matrix), \
               tfidf_matrix.tolist()

    def regular_LSA(self, n_components_for_svd=2):
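        '''
        Run LSA on the plain document-term matrix maintained by the
        underlying WordsAnalysor. Returns the SVD feature matrix, the
        features grouped by label, the TF-IDF matrix, the document-term
        matrix and the feature names.
        '''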
        print >> sys.stderr, '[TEXT]\t%s\tRegular LSA ...' % arrow.now()
        self.words_analysor.LSA(n_components_for_svd=n_components_for_svd)
        feature_matrix = self.words_analysor.svd_matrix.tolist()
        return feature_matrix, \
               self._sort_by_labels(feature_matrix), \
               self.words_analysor.tfidf_matrix.tolist(), \
               self.words_analysor.dt_matrix.tolist(), \
               self.words_analysor.feature_names

    # def fuzzy_LDA(self, n_topics_for_lda=2):
    # 	print >> sys.stderr, '[TEXT]\t%s\tFuzzy LDA ...' % arrow.now()
    # 	feature_matrix = LatentDirichletAllocation(
    # 	    n_topics=n_topics_for_lda,
    # 	    max_iter=5,
    # 	        learning_method='online',
    # 	        learning_offset=50.,
    # 	        random_state=0
    # 	    ).fit_transform(self.dt_matrix).tolist()
    # 	    return feature_matrix, \
    # 	      self._sort_by_labels(feature_matrix)

    # def regular_LDA(self, n_topics_for_lda=2):
    # 	print >> sys.stderr, '[TEXT]\t%s\tRegular LDA ...' % arrow.now()
    # 	self.words_analysor.LDA(n_topics=n_topics_for_lda)
    # 	feature_matrix = self.words_analysor.lda_matrix.tolist()
    # 	return feature_matrix, \
    # 	       self._sort_by_labels(feature_matrix), \
    # 	       self.words_analysor.dt_matrix.tolist(), \
    # 	       self.words_analysor.feature_names

    def _sort_by_labels(self, feature_matrix):
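        '''
        Group the rows of feature_matrix by label: returns a dict that maps
        each label appearing in self.labels to the list of feature vectors
        of the documents carrying that label.
        '''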
        # Get the set of all the labels that appeared
        labels_set = list(
            set([item for sublist in self.labels for item in sublist]))
        label_feature_dict = {}
        for label_in_set in labels_set:
            label_feature_dict[label_in_set] = []
            for i in range(len(self.labels)):
                for label_for_feature in self.labels[i]:
                    if label_for_feature == label_in_set:
                        label_feature_dict[label_in_set].append(
                            feature_matrix[i])
        return label_feature_dict

    def set_text(self, text, label):
        '''
        Process one raw document: tokenize it, match its tokens against the
        keyword categories, and append the resulting term vector and label
        to dt_matrix and labels.
        '''
        # Init
        self._initialize_temporal_variables()
        # raw text
        self.text = text
        # Init words analysor
        self.words_analysor.add_document(text)
        # Tokenize the raw text
        # print >> sys.stderr, '[TEXT]\t%s\tTokenizing ...' % arrow.now()
        self._tokenize()
        # Get the structure of the tokenized text
        # print >> sys.stderr, '[TEXT]\t%s\tGetting Structure ...' % arrow.now()
        self._get_structure()
        # Anchor the locations of keywords in the text
        # print >> sys.stderr, '[TEXT]\t%s\tAnchorring Keywords ...' % arrow.now()
        # self._anchor_keywords()
        # Find K-nearest tokens from the text to the tokens in the words_category
        # print >> sys.stderr, '[TEXT]\t%s\tFinding K nearest tokens ...' % arrow.now()
        self._find_k_nearest_tokens()
        self.dt_matrix.append(self.term_vector)
        self.labels.append(label)

    def _initialize_temporal_variables(self):
        self.sents_by_tokens = []
        self.sents_by_words = []
        self.phrases_count = {}
        self.filtered_phrases = {}
        self.length_of_sents = []
        self.length_of_text = -1
        self.structure = {}
        self.anchors = {}

    ####################################
    # Processing for A Single Document
    ####################################

    def _tokenize(self):
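        '''
        Split the current document into sentences of tokens: register every
        phrase that passes _phrases_filter with the MWE tokenizer, then
        re-tokenize each sentence so that those multi-word expressions
        become single '_'-joined tokens.
        '''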
        self.sents_by_tokens = []
        self.sents_by_words = self.words_analysor.cur_sents_by_words
        # Take interested phrases from the text into consideration
        # Get all possible phrases from the text
        self.phrases_count = self.phrases_extractor.phrases_count(self.text)
        self.filtered_phrases = self._phrases_filter(self.phrases_count.keys())
        # Add the filtered phrases into the MWE Tokenizer
        for p in self.filtered_phrases.keys():
            self.mwe.add_mwe(str(p).split('_'))
        # Tokenize by MWE
        for sent in self.sents_by_words:
            # Text by tokens
            sent_by_tokens = self.mwe.tokenize(sent)
            self.sents_by_tokens.append(sent_by_tokens)

    def _get_structure(self):
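        '''
        Record, for every token that is not an English stopword and is
        longer than WORD_MIN_LEN, its positions in the whole text
        ('text_indexs'), the index of its sentence ('sent_indexs') and its
        position within that sentence ('inner_indexs').
        '''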
        self.length_of_sents = [len(sents) for sents in self.sents_by_tokens]
        self.length_of_text = sum(self.length_of_sents)
        self.structure = defaultdict(
            lambda: {
                # The list of indexs of the token in the whole text
                'text_indexs': [],
                # The list of indexs of the sentences in the whole text
                'sent_indexs': [],
                # The list of indexs of the token in their sentences
                'inner_indexs': []
            })
        text_i = 0
        sent_i = 0
        inner_i = 0
        for sent in self.sents_by_tokens:
            # Tokens structure info
            for token in sent:
                if token not in stopwords.words('english') and len(token) > self.WORD_MIN_LEN:
                    self.structure[token]['text_indexs'].append(text_i)
                    self.structure[token]['sent_indexs'].append(sent_i)
                    self.structure[token]['inner_indexs'].append(inner_i)
                text_i += 1
                inner_i += 1
            sent_i += 1
            inner_i = 0

    def _anchor_keywords(self):
        self.anchors = {}
        for categories in self.words_category.keys():
            category_list = categories.strip().split('/')
            similar_tokens_info = defaultdict(lambda: 0)
            for category in category_list:
                for token in self.structure.keys():
                    sim = self._phrases_similarity(category, token)
                    if sim > self.ANCHOR_MIN_SIM and sim > similar_tokens_info[token]:
                        similar_tokens_info[token] = sim
            self.anchors[categories] = similar_tokens_info
        # print >> sys.stderr, json.dumps(self.anchors, indent=4)

    def _find_k_nearest_tokens(self, K=10):
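        '''
        For every keyword category, build a similarity matrix between the
        tokens found in the text and the tokens listed under the category,
        keep the K best-matching text tokens (with their counts and
        similarity scores), and flatten the result into self.term_vector.
        '''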
        self.k_nearest_tokens = {}
        for category in self.words_category.keys():
            self.k_nearest_tokens[category] = []
        # Calculate the distance between every word/phrase in the text and category
        for category, words_in_category in self.words_category.iteritems():
            tokens_in_text = self.structure.keys()
            tokens_in_category = [w.encode('ascii', 'ignore') for w in words_in_category]
            # Calculate the matrix of distances between
            # words_in_text & words_in_category
            len_i_t = len(tokens_in_text)
            len_i_c = len(tokens_in_category)
            dist_mat = np.zeros((len_i_t, len_i_c))
            for i in range(len_i_t):
                for j in range(len_i_c):
                    token_t = tokens_in_text[i]
                    token_c = tokens_in_category[j]
                    if isPhrase(token_t) and isPhrase(token_c):
                        dist_mat[i, j] = self._phrases_similarity(token_t, token_c)
                    elif (not isPhrase(token_t)) and (not isPhrase(token_c)):
                        dist_mat[i, j] = self._words_similarity(token_t, token_c)
                    else:
                        dist_mat[i, j] = 0
            # Find the best matched token in the text for each token under the category
            best_matched_indexs = dist_mat.argmax(axis=0)
            # Similarity score between each category token and its best matched text token
            best_matched_dists = []
            for j in range(len(best_matched_indexs)):
                best_matched_dists.append(dist_mat[best_matched_indexs[j], j])
            best_matched_dists = np.array(best_matched_dists)
            # Find K-nearest words (to the current category) in the text
            for k in range(K):
                # Category token whose best match in the text has the highest remaining similarity
                j = best_matched_dists.argmax()
                i = best_matched_indexs[j]
                # Stop as soon as no positive similarity is left
                if best_matched_dists[j] <= 0:
                    break
                # Mark this match as used so it is not picked again
                best_matched_dists[j] = -1
                self.k_nearest_tokens[category].append({
                    'in_text': tokens_in_text[i],
                    'in_category': tokens_in_category[j],
                    'count': len(self.structure[tokens_in_text[i]]['text_indexs']),
                    'distance': dist_mat[i, j]
                    # 'rate': self._rate_token_candidates(category, tokens_in_text[i])
                })
        # Convert term dict to numerical term vector
        self.term_vector = self._term_dict2term_vector(self.k_nearest_tokens)
        # print >> sys.stderr, json.dumps(self.k_nearest_tokens, indent=4)

    ####################################
    # Utilities
    ####################################

    def _rate_token_candidates(self, category, candidate_token):
        if not bool(self.anchors[category]):
            return 0
        else:
            dist = np.array([
                self._tokens_min_distance(candidate_token, anchor_token)
                for anchor_token in self.anchors[category].keys()
            ]).astype('float')
            # anchor_sim = np.array([self.anchors[category][anchor_token] for anchor_token in self.anchors[category].keys()]).astype('float')
            anchor_sim = np.array(
                self.anchors[category].values()).astype('float')
            # Rate: determine which token candidate under a category in the text is the most informative, and
            #       most accurate item as to the category.
            # rate = max(anchor_sim * ((1.0 - dist[:,0] / self.length_of_text) ** dist[:,1]))
            rate = max(
                (1.0 - dist[:, 0] / self.length_of_text)**(dist[:, 1] + 1.0))
            return rate

    def _phrases_filter(self, phrases):
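        '''
        Keep only the candidate phrases whose best similarity to one of the
        interested phrases exceeds PHRASE_MIN_SIM; returns a dict that maps
        each kept phrase to its closest interested phrase and the similarity
        score.
        '''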
        filtered_phrases = {}
        for p in phrases:
            sims = [
                self._phrases_similarity(p, p_i)
                for p_i in self.interested_phrases
            ]
            # Remove irrelevant phrases according to the interested phrases list
            if max(sims) > self.PHRASE_MIN_SIM:
                filtered_phrases[p] = {}
                filtered_phrases[p]['similar_phrase'] = self.interested_phrases[np.argmax(sims)]
                filtered_phrases[p]['similarity'] = max(sims)
        return filtered_phrases

    def _words_similarity(self, word_A, word_B):
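        '''
        Similarity of two single words under the word2vec model; if a word
        is out of vocabulary, fall back to 1 for identical words and 0
        otherwise.
        '''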
        try:
            similarity = self.word2vec_model.similarity(word_A, word_B)
        except KeyError:
            # TODO
            if word_A == word_B:
                similarity = 1
            else:
                similarity = 0
        return similarity
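
# A minimal usage sketch (not part of the original example). It assumes the
# models and corpora referenced in conf/text.ini are available, and that the
# helper methods used above but not shown here (_phrases_similarity,
# _tokens_min_distance, _term_dict2term_vector, ...) are defined in the rest
# of the original class. The document list and save path are hypothetical.
if __name__ == '__main__':
    analysor = TextAnalysor()
    documents = [
        ('First raw document text ...', ['category_a']),
        ('Second raw document text ...', ['category_b']),
    ]
    for raw_text, doc_labels in documents:
        analysor.set_text(raw_text, doc_labels)
    features, features_by_label, tfidf = analysor.fuzzy_LSA(n_components_for_svd=2)
    analysor.save_variables('output/text_variables')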