Example #1
from nltk.tokenize import MWETokenizer

def get_route(msg):
    """Return a dict with keys 'origin' and 'destination'."""
    tokenizer = MWETokenizer(CITY_TOKENS)
    route = {'origin': None, 'destination': None}
    tokens = tokenizer.tokenize(msg.lower().split(' '))

    def lookahead(start_idx):
        """Returns a slice of the tokens list starting at index start_idx."""
        end_idx = min(start_idx + TOKEN_LOOKAHEAD, len(tokens))
        boundary_words = ['from', 'to', 'on']
        for i in xrange(start_idx + 1, end_idx):
            if tokens[i] in boundary_words:
                end_idx = i
                break
        return tokens[start_idx:end_idx]

    for i in xrange(len(tokens) - 1):
        if tokens[i] in ['from', 'to']:
            city_tokens = lookahead(i + 1)
            city = determine_city(city_tokens)
            if city is None:
                print "City not recognized: {}".format(' '.join(city_tokens))
            else:
                if tokens[i] == 'from':
                    route['origin'] = city
                elif tokens[i] == 'to':
                    route['destination'] = city
    return route
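The names CITY_TOKENS, TOKEN_LOOKAHEAD, and determine_city live elsewhere in the original module. A minimal sketch of plausible definitions, just enough to run the function (all values below are hypothetical, not from the source project):

# Hypothetical supporting definitions -- not part of the original source.
CITY_TOKENS = [('new', 'york'), ('san', 'francisco')]  # multi-word city names
TOKEN_LOOKAHEAD = 3  # max tokens to scan after 'from'/'to'

def determine_city(city_tokens):
    """Map a token slice such as ['new_york'] to a known city, or None."""
    known = {'new_york': 'New York', 'san_francisco': 'San Francisco'}
    return known.get(' '.join(city_tokens), None)

print get_route('flight from new york to san francisco')
# {'origin': 'New York', 'destination': 'San Francisco'}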
Example #2
    def __init__(self):
        print >> sys.stderr, '[TEXT]\t%s\t*** Initializing Text Object ***' % arrow.now()
        # Read Configuration from ini file
        conf = Config(self.INI_PATH)
        phrases_extractor_path = conf.config_section_map('Model')['n_gram']
        word2vec_model_path = conf.config_section_map('Model')['word2vec']
        words_category_path = conf.config_section_map('Corpus')['key_words']

        # Variable initialization
        # - key words and their related words
        with open(words_category_path, 'rb') as f:
            self.words_category = json.load(f)
        # - all of the related words in the words_category
        print >> sys.stderr, '[TEXT]\t%s\tLoading n-Gram model ...' % arrow.now()
        self.interested_phrases = list(
            set([
                item for sublist in self.words_category.values()  # Get sublist
                for item in sublist  # Merge sublist
                if isPhrase(item)  # Filter non phrases
            ]))
        # - word2vec model
        print >> sys.stderr, '[TEXT]\t%s\tLoading word2vec model ...' % arrow.now()
        self.word2vec_model = Word2Vec.load_word2vec_format(
            word2vec_model_path, binary=True)
        print >> sys.stderr, '[TEXT]\t'
        # - phrases extractor (n-gram kernel)
        self.phrases_extractor = PhrasesExtractor(
            phrases_extractor_path, interested_phrases=self.interested_phrases)
        # - MWE Tokenizer
        self.mwe = MWETokenizer()
        # Init words analysor
        self.words_analysor = WordsAnalysor()
        # Document-Term Vectors
        self.dt_matrix = []
        # Labels for documents
        self.labels = []
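Example #2 constructs MWETokenizer() with no initial expressions; the tokenizer is presumably populated elsewhere in the class via NLTK's add_mwe method. A minimal sketch of that pattern (the expression below is illustrative, not from the original):

from nltk.tokenize import MWETokenizer

mwe = MWETokenizer()
mwe.add_mwe(('word2vec', 'model'))  # illustrative multi-word expression
print mwe.tokenize('loading the word2vec model now'.split())
# ['loading', 'the', 'word2vec_model', 'now']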
Example #3
import pandas as pd
import dask.bag as bag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer

def tokenize():
    """
    Pull multi-word phrases out of the corpus, for example to distinguish
    orange the color from orange the flavor, or to capture the strength of
    a flavor (light_citrus) or of the carbonation (strong_carbonation).
    """
    global review_df
    from phrases import phrase_map, phrases, synonym_map
    phrase_tokenizer = MWETokenizer(phrases)
    stop_word_list = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def process(x):
        return [
            phrase_map.get(word, word) for word in phrase_tokenizer.tokenize([
                lemmatizer.lemmatize(synonym_map.get(word, word)) for word in x
                if word not in stop_word_list
            ])
        ]

    b = bag.from_sequence(review_df['review_pp1'].str.split())
    mapped = b.map(process)
    review_df['review_pp1'] = pd.Series(mapped.compute()).str.join(' ')
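Here phrase_map, phrases, and synonym_map come from a project-local phrases module that is not shown. A sketch of the shapes this code expects (values are illustrative only): phrases holds the tuples handed to MWETokenizer, which joins matched sequences with '_' by default; synonym_map normalizes words before lemmatization; phrase_map renames tokens after tokenization.

# Hypothetical contents of the project's phrases module -- illustrative only.
phrases = [('light', 'citrus'), ('strong', 'carbonation')]
synonym_map = {'carbonated': 'carbonation', 'fizzy': 'carbonation'}
phrase_map = {'strong_carbonation': 'high_carbonation'}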
Example #4
import sys
import sqlite3 as sql
import nltk
import string
import csv
import time
import re
import os

from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.mwe import MWETokenizer

multi_word_tokenizer = MWETokenizer()
# multi_word_tokenizer.add_mwe(("Multiple", "Sklerose"))

con = None
rows = []

chapter_ids = [
    "section1", "section2", "section3", "section4", "section5", "section6",
    "section7", "section8", "section9", "section10", "section11", "section12",
    "section13", "section14", "section15", "section16", "Section7000",
    "Section7050", "Section7100", "Section7150", "Section7200", "Section7250",
    "Section7350", "Section7400", "Section7450", "Section7500", "Section7550",
    "Section7600", "Section7650", "Section7700"
]

def keyword_frequent(all_text):  # all_text: a list of text strings
    synonyms = {}
    feature_1 = ["English", "Cantonese", "Chinese"]
    feature_2 = ["Supervise", "Coach", "Team", "Staff"]
    feature_3 = [
        "Digital Marketing", "Digital Media Buy", "Search Engine Marketing",
        "Search Engine Optimization", "Mobile", "Social Media",
        "Content Calendar", "Performance Marketing", "Channel", "Paid Social",
        "Programmatic Display", "Remarketing", "Social Campaign",
        "Webiste Content", "KOLs", "Content Marketing", "Digital Analytics"
    ]
    feature_4 = [
        "Analysis", "Budget", "ROI", "KPI", "Forecasting", "Program",
        "Competitor Analysis"
    ]
    feature_5 = ["University", "College"]

    feature = {
        "Language": feature_1,
        "Product_Experience": feature_2,
        "Functional_Experience": feature_3,
        "Digital_Marketing_Strategy": feature_4,
        "Education": feature_5
    }

    synonyms["Chinese"] = ["Mandarin", "Putonghua"]
    synonyms["Team"] = ["Team building"]
    synonyms["Digital Marketing"] = [
        "Online", "eDM", "Electronic Direct Marketing"
    ]
    synonyms["Digital Media Buy"] = ["banner ads", "landing page"]
    synonyms["Search Engine Marketing"] = ["SEM"]
    synonyms["Search Engine Optimization"] = ["SEO"]
    synonyms["Social Media"] = [
        "Facebook", "WeChat", "Twitter", "Instagram", "IG", "Snapchat", "Line",
        "Myspace", "Flickr", "LinkedIn", "Xing"
    ]
    synonyms["KOLs"] = ["Key Opinion Leaders"]
    synonyms["ROI"] = ["Return on investment"]

    all_keywords = []
    for key in feature:
        all_keywords += feature[key]

    tokenizer = MWETokenizer([tuple(x.lower().split()) for x in all_keywords])
    all_frequency = nltk.FreqDist(
        tokenizer.tokenize(nltk.word_tokenize("\n".join(all_text).lower())))

    all_keywords_frequency = {}
    for key in feature.keys():
        freq_dict = {}
        for keyword in feature[key]:
            freq = all_frequency["_".join(keyword.lower().split())]
            # print keyword, all_frequency["_".join(keyword.lower().split())]
            if keyword in synonyms.keys():
                for syn in synonyms[keyword]:
                    # print keyword, syn, all_frequency["_".join(syn.lower().split())]
                    freq += all_frequency["_".join(syn.lower().split())]
            freq_dict[keyword] = freq
        all_keywords_frequency[key] = freq_dict

    return all_keywords_frequency
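A short usage sketch showing the shape of the returned nested dict (the sample text is made up; nltk.word_tokenize needs the punkt data to be downloaded once):

nltk.download('punkt')  # tokenizer data, required once per environment
sample = ["Native Cantonese and English speaker; ran SEO and Facebook campaigns."]
freqs = keyword_frequent(sample)
print freqs["Language"]["Cantonese"]  # 1
print freqs["Functional_Experience"]["Search Engine Optimization"]  # 1, via the SEO synonym
print freqs["Functional_Experience"]["Social Media"]  # 1, via the Facebook synonym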