Example #1
def get_route(msg):
    """Returns a map with keys 'origin'" and 'arrival_location'."""
    tokenizer = MWETokenizer(CITY_TOKENS)
    route = {'origin': None, 'destination': None}
    tokens = tokenizer.tokenize(msg.lower().split(' '))

    def lookahead(start_idx):
        """Returns a slice of the tokens list starting at index start_idx."""
        end_idx = min(start_idx + TOKEN_LOOKAHEAD, len(tokens))
        words = ['from', 'to', 'on']
        for i in xrange(start_idx + 1, end_idx):
            if tokens[i] in words:
                end_idx = i
                break
        return tokens[start_idx:end_idx]

    for i in xrange(len(tokens) - 1):
        if tokens[i] in ['from', 'to']:
            city_tokens = lookahead(i + 1)
            city = determine_city(city_tokens)
            if city is None:
                print "City not recognized: {}".format(' '.join(city_tokens))
            else:
                if tokens[i] == 'from':
                    route['origin'] = city
                elif tokens[i] == 'to':
                    route['destination'] = city
    return route
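
For context, MWETokenizer merges the multi-word city names listed in CITY_TOKENS into single tokens before the 'from'/'to' scan above. A minimal sketch with placeholder values (the real CITY_TOKENS, TOKEN_LOOKAHEAD, and determine_city are defined elsewhere in the original project):

from nltk.tokenize import MWETokenizer

CITY_TOKENS = [('new', 'york'), ('san', 'francisco')]  # hypothetical stand-in

tokenizer = MWETokenizer(CITY_TOKENS)
print(tokenizer.tokenize('fly from new york to boston'.split(' ')))
# ['fly', 'from', 'new_york', 'to', 'boston']
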
def mwe_tokenize(tokens, bigrams):
    """Retokenizes tokenized text, combining the 100 most common MWE bigrams
    (previously loaded from bigrams_MWEs.pkl)."""
    tokenizer = MWETokenizer(mwes=bigrams[:100], separator='+')
    tokens = tokenizer.tokenize(tokens)
    return tokens
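
A minimal usage sketch, assuming bigrams is a list of token tuples ordered by frequency (the original module loaded it from bigrams_MWEs.pkl):

bigrams = [('machine', 'learning'), ('new', 'york')]  # hypothetical sample
tokens = ['we', 'teach', 'machine', 'learning', 'in', 'new', 'york']
print(mwe_tokenize(tokens, bigrams))
# ['we', 'teach', 'machine+learning', 'in', 'new+york']
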
def keyword_frequent(all_text):  # all_text: a list of strings
    synonyms = {}
    feature_1 = ["English", "Cantonese", "Chinese"]
    feature_2 = ["Supervise", "Coach", "Team", "Staff"]
    feature_3 = [
        "Digital Marketing", "Digital Media Buy", "Search Engine Marketing",
        "Search Engine Optimization", "Mobile", "Social Media",
        "Content Calendar", "Performance Marketing", "Channel", "Paid Social",
        "Programmatic Display", "Remarketing", "Social Campaign",
        "Webiste Content", "KOLs", "Content Marketing", "Digital Analytics"
    ]
    feature_4 = [
        "Analysis", "Budget", "ROI", "KPI", "Forecasting", "Program",
        "Competitor Analysis"
    ]
    feature_5 = ["University", "College"]

    feature = {
        "Language": feature_1,
        "Product_Experience": feature_2,
        "Functional_Experience": feature_3,
        "Digital_Marketing_Strategy": feature_4,
        "Education": feature_5
    }

    synonyms["Chinese"] = ["Mandarin", "Putonghua"]
    synonyms["Team"] = ["Team building"]
    synonyms["Digital Marketing"] = [
        "Online", "eDM", "Electronic Direct Marketing"
    ]
    synonyms["Digital Media Buy"] = ["banner ads", "landing page"]
    synonyms["Search Engine Marketing"] = ["SEM"]
    synonyms["Search Engine Optimization"] = ["SEO"]
    synonyms["Social Media"] = [
        "Facebook", "WeChat", "Twitter", "Instagram", "IG", "Snapchat", "Line",
        "Myspace", "Flickr", "LinkedIn", "Xing"
    ]
    synonyms["KOLs"] = ["Key Opinion Leaders"]
    synonyms["ROI"] = ["Return on investment"]

    all_keywords = []
    for key in feature:
        all_keywords += feature[key]

    tokenizer = MWETokenizer([tuple(x.lower().split()) for x in all_keywords])
    all_frequency = nltk.FreqDist(
        tokenizer.tokenize(nltk.word_tokenize("\n".join(all_text).lower())))

    all_keywords_frequency = {}
    for key in feature.keys():
        freq_dict = {}
        for keyword in feature[key]:
            freq = all_frequency["_".join(keyword.lower().split())]
            # print keyword, all_frequency["_".join(keyword.split())]
            if keyword in synonyms.keys():
                for syn in synonyms[keyword]:
                    # print keyword, syn, all_frequency["_".join(syn.split())]
                    freq += all_frequency["_".join(syn.lower().split())]
            freq_dict[keyword] = freq
        all_keywords_frequency[key] = freq_dict

    return all_keywords_frequency
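
A minimal usage sketch (assuming nltk, its punkt tokenizer data, and MWETokenizer are available), with a couple of made-up job-ad strings:

job_ads = [
    "Native in Cantonese and fluent in English; SEO and SEM experience a must.",
    "Coach a team of digital marketing executives and report ROI to management.",
]
frequencies = keyword_frequent(job_ads)
print(frequencies['Language'])
# e.g. {'English': 1, 'Cantonese': 1, 'Chinese': 0}
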
Example #5
class TextAnalysor:
    '''
	TEXT ANALYSOR

	This class processes raw text information, extracts useful features
	and provides a user-friendly data API.

	Essential public attributes:
	* dt_matrix: document-term (document-feature) matrix
	* labels:    labels for each of the documents
	'''
    INI_PATH = 'conf/text.ini'
    WORD_MIN_LEN = 2
    ANCHOR_MIN_SIM = 0.5
    PHRASE_MIN_SIM = 0.8

    def __init__(self):
        print >> sys.stderr, '[TEXT]\t%s\t*** Initializing Text Object ***' % arrow.now()
        # Read Configuration from ini file
        conf = Config(self.INI_PATH)
        phrases_extractor_path = conf.config_section_map('Model')['n_gram']
        word2vec_model_path = conf.config_section_map('Model')['word2vec']
        words_category_path = conf.config_section_map('Corpus')['key_words']

        # Variable initialization
        # - key words and their related words
        self.words_category = None
        with open(words_category_path, 'rb') as f:
            self.words_category = json.load(f)
        # - all of the related words in the words_category
        print >> sys.stderr, '[TEXT]\t%s\tLoading n-Gram model ...' % arrow.now()
        self.interested_phrases = list(
            set([
                item for sublist in self.words_category.values()  # Get sublist
                for item in sublist  # Merge sublist
                if isPhrase(item)  # Filter non phrases
            ]))
        # - word2vec model
        print >> sys.stderr, '[TEXT]\t%s\tLoading word2vec model ...' % arrow.now()
        self.word2vec_model = Word2Vec.load_word2vec_format(
            word2vec_model_path, binary=True)
        print >> sys.stderr, '[TEXT]\t'
        # - phrases extractor (n-gram kernel)
        self.phrases_extractor = PhrasesExtractor(
            phrases_extractor_path, interested_phrases=self.interested_phrases)
        # - MWE Tokenizer
        self.mwe = MWETokenizer()
        # Init words analysor
        self.words_analysor = WordsAnalysor()
        # Document-Term Vectors
        self.dt_matrix = []
        # Labels for documents
        self.labels = []

    def save_variables(self, file_path):
        '''
		SAVE VARIABLES

		This method saves the text analysor to two files: a .npy file
		that stores the document-term matrix, and a text file that
		stores the labels.
		'''
        # Save the document-term matrix
        np.save(file_path, self.dt_matrix)
        # Save the labels information
        labels = [
            '#'.join(multiple_labels) + '\n' for multiple_labels in self.labels
        ]
        with open(file_path + '.txt', 'w') as f:
            try:
                f.writelines(labels)
            except:
                print >> sys.stderr, '[ERROR] Saving failed. Invalid file path: %s' % file_path

    def load_variables(self, file_path):
        '''
		LOAD VARIABLES

		This method loads two files (.txt for the labels information and
		.npy for the document-term matrix) from the local file system to
		initialize a text analysor instance.
		'''
        if not os.path.exists(file_path + '.txt') or \
           not os.path.exists(file_path + '.npy'):
            print >> sys.stderr, '[WARN] Loading failed. Invalid file path: %s' % file_path
            return
        # Load the document-term matrix
        self.dt_matrix = np.load(file_path + '.npy').tolist()
        # Load the labels information
        with open(file_path + '.txt', 'r') as f:
            try:
                labels = f.readlines()
                self.labels = [
                    list(set(label.strip('\n').split('#'))) for label in labels
                ]
            except:
                print >> sys.stderr, '[ERROR] Loading failed. Unknown error'

    ####################################
    # Global Analysis
    ####################################

    def fuzzy_LSA(self, n_components_for_svd=2):
        print >> sys.stderr, '[TEXT]\t%s\tFuzzy LSA ...' % arrow.now()
        # Tf-idf Transformation
        tfidf = TfidfTransformer()
        tfidf_matrix = tfidf.fit_transform(self.dt_matrix).toarray()
        # SVD
        # n_components is recommended to be 100 by Sklearn Documentation for LSA
        # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
        svd = TruncatedSVD(n_components=n_components_for_svd)
        svd_matrix = svd.fit_transform(tfidf_matrix)
        # print >> sys.stderr, tfidf_matrix
        # print >> sys.stderr, svd_matrix
        feature_matrix = svd_matrix.tolist()
        return feature_matrix, \
               self._sort_by_labels(feature_matrix), \
               tfidf_matrix.tolist()
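
    # For reference, fuzzy_LSA follows the usual scikit-learn LSA recipe
    # (tf-idf weighting followed by truncated SVD). A minimal standalone
    # sketch, assuming a small dense count matrix `dt` in place of
    # self.dt_matrix:
    #
    #   from sklearn.feature_extraction.text import TfidfTransformer
    #   from sklearn.decomposition import TruncatedSVD
    #   dt = [[3, 0, 1], [0, 2, 4], [1, 1, 0]]
    #   tfidf_matrix = TfidfTransformer().fit_transform(dt).toarray()
    #   svd_matrix = TruncatedSVD(n_components=2).fit_transform(tfidf_matrix)
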

    def regular_LSA(self, n_components_for_svd=2):
        print >> sys.stderr, '[TEXT]\t%s\tRegular LSA ...' % arrow.now()
        self.words_analysor.LSA(n_components_for_svd=n_components_for_svd)
        feature_matrix = self.words_analysor.svd_matrix.tolist()
        return feature_matrix, \
               self._sort_by_labels(feature_matrix), \
               self.words_analysor.tfidf_matrix.tolist(), \
               self.words_analysor.dt_matrix.tolist(), \
               self.words_analysor.feature_names

    # def fuzzy_LDA(self, n_topics_for_lda=2):
    # 	print >> sys.stderr, '[TEXT]\t%s\tFuzzy LDA ...' % arrow.now()
    # 	feature_matrix = LatentDirichletAllocation(
    # 	    n_topics=n_topics_for_lda,
    # 	    max_iter=5,
    # 	        learning_method='online',
    # 	        learning_offset=50.,
    # 	        random_state=0
    # 	    ).fit_transform(self.dt_matrix).tolist()
    # 	    return feature_matrix, \
    # 	      self._sort_by_labels(feature_matrix)

    # def regular_LDA(self, n_topics_for_lda=2):
    # 	print >> sys.stderr, '[TEXT]\t%s\tRegular LDA ...' % arrow.now()
    # 	self.words_analysor.LDA(n_topics=n_topics_for_lda)
    # 	feature_matrix = self.words_analysor.lda_matrix.tolist()
    # 	return feature_matrix, \
    # 	       self._sort_by_labels(feature_matrix), \
    # 	       self.words_analysor.dt_matrix.tolist(), \
    # 	       self.words_analysor.feature_names

    def _sort_by_labels(self, feature_matrix):
        # Get the set of all the labels that appeared
        labels_set = list(
            set([item for sublist in self.labels for item in sublist]))
        label_feature_dict = {}
        for label_in_set in labels_set:
            label_feature_dict[label_in_set] = []
            for i in range(len(self.labels)):
                for label_for_feature in self.labels[i]:
                    if label_for_feature == label_in_set:
                        label_feature_dict[label_in_set].append(
                            feature_matrix[i])
        return label_feature_dict

    def set_text(self, text, label):
        '''
		SET TEXT

		This method adds a raw text document and its label: it tokenizes
		the text, extracts its nearest-token features, and appends the
		resulting term vector and label to dt_matrix and labels.
		'''
        # Init
        self._initialize_temporal_variables()
        # raw text
        self.text = text
        # # Init words analysor
        self.words_analysor.add_document(text)
        # Tokenize the raw text
        # print >> sys.stderr, '[TEXT]\t%s\tTokenizing ...' % arrow.now()
        self._tokenize()
        # Get the structure of the tokenized text
        # print >> sys.stderr, '[TEXT]\t%s\tGetting Structure ...' % arrow.now()
        self._get_structure()
        # Anchor the locations of keywords in the text
        # print >> sys.stderr, '[TEXT]\t%s\tAnchorring Keywords ...' % arrow.now()
        # self._anchor_keywords()
        # Find K-nearest tokens from the text to the tokens in the words_category
        # print >> sys.stderr, '[TEXT]\t%s\tFinding K nearest tokens ...' % arrow.now()
        self._find_k_nearest_tokens()
        self.dt_matrix.append(self.term_vector)
        self.labels.append(label)

    def _initialize_temporal_variables(self):
        self.sents_by_tokens = []
        self.sents_by_words = []
        self.phrases_count = {}
        self.filtered_phrases = {}
        self.length_of_sents = []
        self.length_of_text = -1
        self.structure = {}
        self.anchors = {}

    ####################################
    # Processing for A Single Document
    ####################################

    def _tokenize(self):
        self.sents_by_tokens = []
        self.sents_by_words = self.words_analysor.cur_sents_by_words
        # Take interested phrases from the text into consideration
        self.phrases_count = self.phrases_extractor.phrases_count(
            self.text)  # Get all possible phrases from the text
        self.filtered_phrases = self._phrases_filter(self.phrases_count.keys())
        # Add the filtered phrases into the MWE Tokenizer
        for p in self.filtered_phrases.keys():
            self.mwe.add_mwe(str(p).split('_'))
        # Tokenize by MWE
        for sent in self.sents_by_words:
            # Text by tokens
            sent_by_tokens = self.mwe.tokenize(sent)
            self.sents_by_tokens.append(sent_by_tokens)

    def _get_structure(self):
        self.length_of_sents = [len(sents) for sents in self.sents_by_tokens]
        self.length_of_text = sum(self.length_of_sents)
        self.structure = defaultdict(
            lambda: {
                # Indices of the token's occurrences in the whole text
                'text_indexs': [],
                # Indices of the sentences that contain the token
                'sent_indexs': [],
                # Indices of the token within its sentences
                'inner_indexs': []
            })
        text_i = 0
        sent_i = 0
        inner_i = 0
        for sent in self.sents_by_tokens:
            # Tokens structure info
            for token in sent:
                if token not in stopwords.words(
                        'english') and len(token) > self.WORD_MIN_LEN:
                    self.structure[token]['text_indexs'].append(text_i)
                    self.structure[token]['sent_indexs'].append(sent_i)
                    self.structure[token]['inner_indexs'].append(inner_i)
                text_i += 1
                inner_i += 1
            sent_i += 1
            inner_i = 0
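
    # A hypothetical illustration of the resulting structure for
    # self.sents_by_tokens = [['machine_learning', 'is', 'fun']]
    # ('is' is dropped as a stopword):
    #
    #   self.structure == {
    #       'machine_learning': {'text_indexs': [0], 'sent_indexs': [0], 'inner_indexs': [0]},
    #       'fun':              {'text_indexs': [2], 'sent_indexs': [0], 'inner_indexs': [2]},
    #   }
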

    def _anchor_keywords(self):
        self.anchors = {}
        for categories in self.words_category.keys():
            category_list = categories.strip().split('/')
            similar_tokens_info = defaultdict(lambda: 0)
            for category in category_list:
                for token in self.structure.keys():
                    sim = self._phrases_similarity(category, token)
                    if sim > self.ANCHOR_MIN_SIM and sim > similar_tokens_info[
                            token]:
                        similar_tokens_info[token] = sim
            self.anchors[categories] = similar_tokens_info
        # print >> sys.stderr, json.dumps(self.anchors, indent=4)

    def _find_k_nearest_tokens(self, K=10):
        self.k_nearest_tokens = {}
        for category in self.words_category.keys():
            self.k_nearest_tokens[category] = []
        # Calculate the distance between every word/phrase in the text and category
        for category, words_in_category in self.words_category.iteritems():
            tokens_in_text = self.structure.keys()
            tokens_in_category = map(lambda x: x.encode('ascii', 'ignore'),
                                     words_in_category)
            # Calculate the matrix of distances between
            # words_in_text & words_in_category
            len_i_t = len(tokens_in_text)
            len_i_c = len(tokens_in_category)
            dist_mat = np.zeros((len_i_t, len_i_c))
            for i in range(len_i_t):
                for j in range(len_i_c):
                    if isPhrase(tokens_in_text[i]) and isPhrase(
                            tokens_in_category[j]):
                        dist_mat[i, j] = self._phrases_similarity(
                            tokens_in_text[i], tokens_in_category[j])
                    elif (not isPhrase(tokens_in_text[i])) and (not isPhrase(
                            tokens_in_category[j])):
                        dist_mat[i, j] = self._words_similarity(
                            tokens_in_text[i], tokens_in_category[j])
                    else:
                        dist_mat[i, j] = 0
            # Find the best-matched token in the text for each token under the category
            # (the row index of the best match for each category token)
            best_matched_indexs = dist_mat.argmax(axis=0)
            # The similarity between each category token and its best match in the text
            best_matched_dists = []
            for j in range(len(best_matched_indexs)):
                best_matched_dists.append(dist_mat[best_matched_indexs[j], j])
            best_matched_dists = np.array(best_matched_dists)
            # Find K-nearest words (to the current category) in the text
            for k in range(K):
                # The index of the category token whose best match has the highest similarity
                j = best_matched_dists.argmax()
                i = best_matched_indexs[j]
                # If the current best matched distance is not positive, abandon it
                if best_matched_dists[j] <= 0:
                    break
                # Remove the largest value from best_matched_dists
                best_matched_dists[j] = -1
                self.k_nearest_tokens[category].append({
                    'in_text': tokens_in_text[i],
                    'in_category': tokens_in_category[j],
                    'count': len(self.structure[tokens_in_text[i]]['text_indexs']),
                    'distance': dist_mat[i, j]
                    # 'rate': self._rate_token_candidates(category, tokens_in_text[i])
                })
        # Convert term dict to numerical term vector
        self.term_vector = self._term_dict2term_vector(self.k_nearest_tokens)
        # print >> sys.stderr, json.dumps(self.k_nearest_tokens, indent=4)

    ####################################
    # Utilities
    ####################################

    def _rate_token_candidates(self, category, candidate_token):
        if not bool(self.anchors[category]):
            return 0
        else:
            dist = np.array([
                self._tokens_min_distance(candidate_token, anchor_token)
                for anchor_token in self.anchors[category].keys()
            ]).astype('float')
            # anchor_sim = np.array([self.anchors[category][anchor_token] for anchor_token in self.anchors[category].keys()]).astype('float')
            anchor_sim = np.array(
                self.anchors[category].values()).astype('float')
            # Rate: determines which token candidate under a category is the most
            #       informative and most relevant to the category.
            # rate = max(anchor_sim * ((1.0 - dist[:,0] / self.length_of_text) ** dist[:,1]))
            rate = max(
                (1.0 - dist[:, 0] / self.length_of_text)**(dist[:, 1] + 1.0))
            return rate

    def _phrases_filter(self, phrases):
        filtered_phrases = {}
        for p in phrases:
            sims = [
                self._phrases_similarity(p, p_i)
                for p_i in self.interested_phrases
            ]
            # Remove irrelevant phrases according to the interested phrases list
            if max(sims) > self.PHRASE_MIN_SIM:
                filtered_phrases[p] = {}
                filtered_phrases[p]['similar_phrase'] = self.interested_phrases[np.argmax(sims)]
                filtered_phrases[p]['similarity'] = max(sims)
        return filtered_phrases

    def _words_similarity(self, word_A, word_B):
        try:
            similarity = self.word2vec_model.similarity(word_A, word_B)
        except KeyError:
            # TODO
            if word_A == word_B:
                similarity = 1
            else:
                similarity = 0
        return similarity