Example #1
def tokenize(multi_word_queries, text):
    """Returns a list of words that make up the text.
    Params: {text: String}
    Returns: List
    """
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(
        'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
    result = tokenizer.tokenize(lower_case)
    multi_tokenizer = MWETokenizer([('working', 'out'), ('coffee', 'shops'),
                                    ('average', 'prices'), ('union', 'square'),
                                    ('real', 'estate'), ('ice', 'cream'),
                                    ('whole', 'foods'), ('co', 'op'),
                                    ('wall', 'street'), ('world', 'trade'),
                                    ('high', 'school'), ('dim', 'sum'),
                                    ('empire', 'state'), ('high', 'rise'),
                                    ('walk', 'ups')])
    if len(multi_word_queries) > 0:
        for tok in multi_word_queries:
            if (len(tok.split('_')) > 1):
                multi_tokenizer.add_mwe(tuple(tok.split('_')))
    #add neighborhood names
    for n in neighborhood_name_phrases:
        multi_tokenizer.add_mwe(tuple(n.split('_')))

    result2 = multi_tokenizer.tokenize(result)
    return result2
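A hedged usage sketch for the helper above, assuming the nltk.tokenize imports and a module-level neighborhood_name_phrases list that the snippet relies on; the names and values here are illustrative only:

# minimal, assumed setup; names and values are illustrative only
from nltk.tokenize import RegexpTokenizer, MWETokenizer

neighborhood_name_phrases = ['east_village', 'hells_kitchen']   # assumed global

print(tokenize([], "Not very good coffee shops near the East Village"))
# roughly: ['not very good', 'coffee_shops', 'near', 'the', 'east_village']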
Example #2
def trim_bio(text):

    # keywords to return
    keywords = []

    # load from file after custom edit
    df_keyword = pd.read_csv(local_data + "data/keywords/df.csv")

    ## convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()

    ## format important words so that they can be registered to tokenizer
    important_words = [x.split() for x in important_words]

    # initialize tokenizer
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe([x for x in iw])  # add important words
        #tokenizer.add_mwe(iw)  # add important words

    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])

    # find important words from tokens, append it to keyword
    for iw in important_words:
        iw_joined = "_".join(iw)
        if (iw_joined in tokens):
            keywords.append(iw_joined)

    return keywords
Example #3
def tokenization(docs):
    documents = {}

    for doc in docs:
        document_plain = docs[doc]
        document_plain = document_plain.replace("/", "").replace("-", " ")
        #re.sub(r'\([^)]*\)', '', document_plain)
        document_plain = re.sub(r'\([0-9]*\)', '', document_plain)

        relevant_words = []
        mwetokenizer = MWETokenizer()
        document_ner = spacy_nlp(document_plain)

        for element in document_ner.ents:
            # don't consider numbers
            if element.label_ not in "CARDINAL":
                relevant_words.append(element)

        #for each relevant word, if whitespace is present, create a single token with all the words
        for word in relevant_words:
            token = str(word).split()
            if len(token) > 1:
                move_data = []
                for element in token:
                    move_data.append(element)
                tup = tuple(move_data)
                mwetokenizer.add_mwe(tup)

        document_tokenized = word_tokenize(document_plain)
        document_retokenized = mwetokenizer.tokenize(document_tokenized)

        documents[doc] = document_retokenized
    return documents
Example #4
    def get_context(self, query_str, text, k=10):
        if query_str in text:
            tokenizer = MWETokenizer()
            query_str_tokens = tuple(query_str.split())
            query_str_dashed = "_".join(query_str_tokens)
            tokenizer.add_mwe(query_str_tokens)
            text_token = tokenizer.tokenize(text.split())
            try:
                t_start = text_token.index(query_str_dashed)
            except ValueError:
                return None, None, None
            t_end = t_start + 1
            start_index = max(t_start - k, 0)
            end_index = min(t_end + k, len(text_token))
            text_token_query = text_token[start_index:t_start] + text_token[
                t_end + 1:end_index]
            context = " ".join(text_token_query)
            context_mention = text_token[start_index:t_start] + [
                query_str
            ] + text_token[t_end + 1:end_index]
            context_mention = " ".join(context_mention)
            return context, text_token_query, context_mention
        else:
            logging.info('error, query not in text')
            return None, None, None
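The windowing above depends on MWETokenizer collapsing the multi-word query into one underscore-joined token so that list.index can locate it; a minimal standalone illustration (separate from the original class):

from nltk.tokenize import MWETokenizer

tok = MWETokenizer()
tok.add_mwe(('machine', 'learning'))       # hypothetical query_str
tokens = tok.tokenize('we love machine learning a lot'.split())
print(tokens)                              # ['we', 'love', 'machine_learning', 'a', 'lot']
print(tokens.index('machine_learning'))    # 2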
Example #5
def init_base_order_tokenizer():
    p = nltk.PorterStemmer()
    food_tokenizer = MWETokenizer()
    food_items = {}
    prices_items = {}
    image_items = {}
    cal_items = {}
    with open('sheet1.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            food_item = row['Menu Item'].replace(' ', '_').lower()
            price = float(row['Price'])
            image = row['Image']
            cal = float(row['Calories'])
            image_items[food_item] = image
            food_items[food_item] = 0
            prices_items[food_item] = price
            cal_items[food_item] = cal

            items_stem = [
                p.stem(i) for i in row['Menu Item'].lower().split(' ')
            ]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))

    with open('mwe.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            items_stem = [
                p.stem(i) for i in row['Menu Item'].lower().split(' ')
            ]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))

    return food_tokenizer, food_items, prices_items, cal_items, image_items
Example #6
def initialize_known_phrase_tokenization(phrases):
    from nltk.tokenize import MWETokenizer
    tokenizer = MWETokenizer()
    for phrase in phrases:
        if (phrase):
            phrase_as_list = phrase.replace("_", " ").split()
            tokenizer.add_mwe(phrase_as_list)
    return tokenizer
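A quick, hedged check of the helper above; the phrases are made up and may use either underscores or spaces:

tok = initialize_known_phrase_tokenization(['new_york', 'machine learning'])
print(tok.tokenize('i moved to new york to study machine learning'.split()))
# ['i', 'moved', 'to', 'new_york', 'to', 'study', 'machine_learning']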
Example #7
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params

    idf_list = [*idf]
    idf_set = set(idf_list)

    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    phrases_score = {}
    for phrase in tqdm(list_phrases,
                       desc='phrase-eval-{}'.format(pid),
                       mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))

        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)

        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = target_token_freq[token] / len(
                target_token) * idf[token]

        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = phrase_token_freq[token] / len(
                tokens) * idf[token]

        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)

        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})

    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)

    return {key: phrases_score[key] for key in top_10}
Example #8
def LoadTokenizer():
    global tokenizer
    tokenizer = MWETokenizer(separator=' ')
    for spword in WordDict:
        if ' ' in spword:
            tupleword = tuple(spword.split(' '))
            tokenizer.add_mwe(tupleword)
        if ':' in spword:
            tupleword = tuple(re.split(r"(:)", spword))
            tokenizer.add_mwe(tupleword)
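Because this tokenizer is built with separator=' ', matched expressions are re-joined with a space rather than the default underscore; a small sketch of the behaviour:

from nltk.tokenize import MWETokenizer

space_tok = MWETokenizer(separator=' ')
space_tok.add_mwe(('new', 'york'))
print(space_tok.tokenize('i love new york'.split()))   # ['i', 'love', 'new york']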
Example #9
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()

    #add tuples of words into multiword tokenizer
    for word in relevant_words:
        token = str(word).split()
        move_data=[]
        for element in token:
            move_data.append(element)
        tup = tuple(move_data)
        mwetokenizer.add_mwe(tup)

    #execute multitokenization
    return mwetokenizer.tokenize(text)
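A hedged usage sketch; note that text must already be a list of tokens rather than a raw string, and the relevant word here is made up:

print(multi_word_tokenizer(['New York'], 'I love New York'.split()))
# ['I', 'love', 'New_York']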
Example #10
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()
    f.close()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))

    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir,fname), 'r') as f:
            doc = f.readlines()
        f.close()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                doc = nlp(item_dict['text'])
                item_dict.update({'entityMentioned':mentioned_entity})
                unigram = [token.text for token in textacy.extract.ngrams(doc,n=1,filter_nums=True, filter_punct=True, filter_stops=True)]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in doc]
                pos = [token.pos_ for token in doc]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
        f.close()
Example #11
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, pid = params

    idf_list = [*idf]
    idf_set = set(idf_list)

    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    phrases_score = []
    for phrase in list_phrases:
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue

        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)

        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = target_token_freq[token] / len(
                target_token) * idf[token]

        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = phrase_token_freq[token] / len(
                tokens) * idf[token]

        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)

        phrases_score.append([phrase, tfidf_sim])

    return phrases_score
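The scoring loop above boils down to comparing two tf-idf vectors with cosine similarity; a self-contained sketch with made-up tokens and idf values:

from collections import Counter
from scipy import spatial

idf = {'apple': 1.0, 'pie': 2.0, 'tart': 2.0}      # assumed idf table
target_token = ['apple', 'pie']
tokens = ['apple', 'tart']
vocab = list(set(target_token) | set(tokens))

def tfidf_vec(toks):
    freq = Counter(toks)
    return [freq[t] / len(toks) * idf[t] for t in vocab]

tfidf_sim = 1 - spatial.distance.cosine(tfidf_vec(target_token), tfidf_vec(tokens))
print(round(tfidf_sim, 2))                         # 0.2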
Example #12
    def __get_tokenizer(self):
        f = open(entity_path)
        mwetokenizer = MWETokenizer([], separator=' ')
        i = 30
        while True:
            i = i - 1
            if i <= 0:
                break
            line = f.readline()
            if not line:
                break
            indexed_token = line.lower().split()
            token = indexed_token
            token.pop(0)
            token = tuple(token)
            # print(token)
            mwetokenizer.add_mwe(token)

        f.close()
        return mwetokenizer
Example #13
def trim_bio(text):

    # keywords to return
    keywords = []

    ## define important words
    #important_words = [ ["data", "science"],
    #                    ["data", "scientist"],
    #                    ["machine", "learning"],
    #                    ["data", "engineer"],
    #                    ["data", "analytics"],
    #                    ["artificial", "intelligence"],
    #                    ["ai"], ["phd"], ["founder"], ["professor"],["candidate"],["ceo"],
    #                    ["student"], ["engineer"], ["computer", "science"]
    #                    ]

    # load from file after custom edit
    df_keyword = pd.read_csv("data/keywords/df.csv")

    ## convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()

    ## format important words so that they can be registered to tokenizer
    important_words = [x.split() for x in important_words]

    # initialize tokenizer
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe([x for x in iw])  # add important words
        #tokenizer.add_mwe(iw)  # add important words

    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])

    # find important words from tokens, append it to keyword
    for iw in important_words:
        iw_joined = "_".join(iw)
        if (iw_joined in tokens):
            keywords.append(iw_joined)

    return keywords
Example #14
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(0, len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        res = tokenizer.add_mwe(aspect_split)
        aspect_tokenized.append(res)
    for j in range(0, len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
Example #15
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()
    f.close()

    entityset = set(raw_list.split('\n'))

    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir,fname), 'r') as f:
            doc = f.readlines()
        f.close()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            if set(item_dict['nsubj']).issubset(pronoun) or item_dict['nsubj'] == []:
                continue
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                item_dict.update({'entityMentioned':mentioned_entity})
                item_dict['iid'] = '{}{}{}'.format(item_dict['did'],item_dict['pid'],item_dict['sid'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
        f.close()
Example #16
def text_process(text):
    #number removal
    if text == -2:
        return ''

    body = re.sub(r'\d+', '', text)

    #punctuation removal i.e. [!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]
    #     punc = string.punctuation
    #     punct_mapping = {"_":" ", "'":" "}
    #     punc += "“”’"
    punc = "/-'?!,#$%\'()*+-/:;<=>@\\^_`{|}~[]" + '""“”’'

    #     punc = re.sub("-","", punc)
    body = body.translate(body.maketrans(punc, " " * len(punc)))

    #text lower
    body = body.lower()

    #multi-word tokenize
    multi_word_list = [('north', 'korea'), ('south', 'korea'),
                       ('north', 'korean'), ('south', 'korean'),
                       ('kim', 'jong', 'un'), ('park', 'geun', 'hye')]
    tokenizer = MWETokenizer()
    for mw in multi_word_list:
        tokenizer.add_mwe(mw)
    text = tokenizer.tokenize(body.split())

    #stopword removal
    stopset = set(stopwords.words('english'))
    #     text = word_tokenize(body)
    text = [x for x in text if x not in stopset]
    text = [word for word in text if len(word) > 3]

    #lemmatization
    lemmatizer = WordNetLemmatizer()
    lemma_text = [lemmatizer.lemmatize(x) for x in text]

    return lemma_text
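Assuming the imports this snippet relies on (re, MWETokenizer, NLTK stopwords, WordNetLemmatizer) and the required NLTK corpora are available, a rough spot check:

# illustrative headline; the output shown is approximate
print(text_process("North Korea launched 2 missiles over the sea"))
# roughly: ['north_korea', 'launched', 'missile']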
def Tokenize(text):
    tokenizer = MWETokenizer(category.all())
    for word in category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in sub_category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in brand:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in article:
        if ' ' in word:
            tokenizer.add_mwe(word.split())

    token = tokenizer.tokenize(text.split())
    tokens = []
    for word in token:
        word = word.replace("_", " ")
        tokens.append(word)
    return tokens
def main_thrd(query, num_process, input_dir, target):
    start_time = time.time()
    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    ##### sentence search #####
    input_files = os.listdir(input_dir)
    tasks = list(split(input_files, num_process))

    inputs = [(tasks[i], query, input_dir) for i in range(num_process)]

    with Pool(num_process) as p:
        search_results = p.map(sent_search, inputs)

    search_merge = search_results[0]['context']
    count_merge = search_results[0]['freq']

    for pid in range(1, len(search_results)):
        tmp_context = search_results[pid]['context']
        tmp_freq = search_results[pid]['freq']
        for ent in query:
            search_merge[ent] += tmp_context[ent]
            count_merge[ent]['total'] += tmp_freq[ent]['total']
            tmp_freq[ent].pop('total', None)
            count_merge[ent].update(tmp_freq[ent])

    for ent in query:
        for index in range(len(search_merge[ent])):
            search_merge[ent][index]['doc_score'] = count_merge[ent][
                search_merge[ent][index]['did']] / count_merge[ent]['total']

    print("--- search use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    unigrams = []
    for ent in query:
        for sent in search_merge[ent]:
            unigrams += sent['unigram']
    unigram_set = set(unigrams)

    N = 0
    cnt = Counter()
    for ent in query:
        N += len(search_merge[ent])
        for sent in search_merge[ent]:
            cnt.update(sent['tokens'])
    cnt = dict(cnt)

    for ent in query:
        unigram_set.discard(ent)

    idf = {}
    for key in cnt.keys():
        idf.update({key: np.log(N / cnt[key])})

    unigram_sents = {}
    for ent in query:
        unigram_sents.update({ent: {}})
        for sent in search_merge[ent]:
            unigram = set(sent['unigram'])
            unigram_intersect = unigram.intersection(unigram_set)
            for item in unigram_intersect:
                if item in unigram_sents[ent].keys():
                    unigram_sents[ent][item].append(sent)
                else:
                    unigram_sents[ent].update({item: [sent]})

    score_dist = {}
    for ug in unigram_set:
        score_dist.update({ug: {}})
        for ent in query:
            score_dist[ug].update({ent: 0})
            if ug in unigram_sents[ent].keys():
                did = set()
                for sent in unigram_sents[ent][ug]:
                    score_dist[ug][ent] += sent['doc_score'] * idf[ug]
                    did.add(sent['did'])

    #using rank to score unigram
    score_redist = {}
    for ent in query:
        score_redist.update({ent: dict.fromkeys(unigram_set, 0)})
        for ug in unigram_set:
            score_redist[ent][ug] = score_dist[ug][ent]
        sorted_score = sorted(score_redist[ent].items(),
                              key=lambda item: item[1],
                              reverse=True)
        rank, count, previous, result = 0, 0, None, {}
        for key, num in sorted_score:
            count += 1
            if num != previous:
                rank += count
                previous = num
                count = 0
            result[key] = 1.0 / rank
        score_redist[ent] = result

    for ug in unigram_set:
        for ent in query:
            score_dist[ug][ent] = score_redist[ent][ug]

    query_weight = []
    for ent in query:
        query_weight.append(
            1 / skew([sent['doc_score'] for sent in search_merge[ent]]))

    agg_score = {}
    for ug in score_dist.keys():
        tmp_res = [item[1] for item in score_dist[ug].items()]
        wgmean = np.exp(
            sum(query_weight * np.log(tmp_res)) / sum(query_weight))
        agg_score.update({ug: wgmean})

    score_sorted = sorted(agg_score.items(), key=lambda x: x[1], reverse=True)

    print("--- unigram score %s seconds ---" % (time.time() - start_time))
    print(score_sorted[:10])
    sys.stdout.flush()

    start_time = time.time()

    tokenizer = MWETokenizer(separator=' ')
    for ent in query:
        tokenizer.add_mwe(nltk.word_tokenize(ent))

    mined_phrases = []
    query_set = set(query)
    for ent in query:
        for sent in search_merge[ent]:
            for phrase in sent['phrases']:
                tokens = nltk.word_tokenize(phrase)
                raw_tokenized = tokenizer.tokenize(tokens)
                tokenized_set = set(raw_tokenized)
                if tokenized_set.intersection(query_set) == set():
                    mined_phrases.append(phrase)

    print("--- phrase mining %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()

    idf_list = [*idf]
    target_doc = nlp(target)
    target_vec = [0] * len(idf_list)
    target_token = [token.lemma_ for token in target_doc if not token.is_punct]

    list_phrases = list(set(mined_phrases))

    tasks = list(split(list_phrases, num_process))

    print('target_token', target_token)

    inputs = [(tasks[i], unigram_set, target_token, idf, agg_score, i)
              for i in range(num_process)]

    phrases_score = {}
    with Pool(num_process) as p:
        eval_results = p.map(phrase_eval, inputs)

    for tmp_res in eval_results:
        phrases_score.update(tmp_res)

    phrases_sorted = sorted(phrases_score.items(),
                            key=lambda x: x[1]['score'],
                            reverse=True)

    print("--- phrase eval use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    return phrases_sorted
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

words = ["resource", "resources",
	"company", "companies",
	"run", "ran",
	"like", "likes"]

for w in words:
	print("{} = {}".format(w, lemmatizer.lemmatize(w)))

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer()
tokenizer.add_mwe(('open', 'source'))
words = tokenizer.tokenize('The governance of open source projects'.split())

print(" ".join(words))
    'BG',
    'RG',
    'GG',
    'WP',
    'UP',
    'BP',
    'RP',
    'GP',
]

#create tokenizers
no_space_mwe_tokenizer = MWETokenizer(separator='')
space_mwe_tokenizer = MWETokenizer(separator=' ')

for token in no_space_mwes:
    no_space_mwe_tokenizer.add_mwe(token)
for token in space_mwes:
    space_mwe_tokenizer.add_mwe(token)


def add_starts_stops(token_list):
    return ['<START>'] + token_list + ['<STOP>']


#tokenize sentences
def tokenize(representation):
    line = representation
    for seq in seqs_to_insert_spaces_for:
        if seq in line:
            line = (' ' + seq + ' ').join(line.split(seq))
    word_level = word_tokenize(line)
for sent in morphed_names:
    list2 = []
    for w in sent:
        if w != 'COGS':
            w = w.lower()
        list2.append(w)
    morphed_names2.append(list2)

final_name_list = []
for sent in morphed_names2:
    sent = ' '.join(sent)
    final_name_list.append(sent)

for strngs in morphed_names2:
    if len(strngs) > 1:
        tokenizer.add_mwe(strngs)

df.columns = final_name_list

aggregate_words = ["total", "sum", "overall", "aggregate"]
ranking_words = {
    "biggest": 1,
    "highest": 1,
    "peak": 1,
    "top": 1,
    "smallest": 0,
    "least": 0,
    "lowest": 0,
    "bottom": 0
}
gap_words = ["gap", "difference"]
Example #22
            wd_permutations = []
            for x in range(min_perm, max_perm):
                for foods in permutations(words, x):
                    wd_permutations.append(foods)
            #wd_permutations = permutations(words, 2)
            list_wd_permutations = list(wd_permutations)

            #join tuples back together into a string and append them to the new list
            for list_wd_permutation in list_wd_permutations:

                join_wd_permutations = (' '.join(list_wd_permutation))
                list_join_wd_permutations.add(join_wd_permutations)

                #if the number of words is more than 1, then add it to the list of mwe
                if (len(list_wd_permutation) > 1):
                    mwe_tokenizer.add_mwe(list_wd_permutation)


print("\n+++++++++++\n")

print("Complete list of permutations:")
print(list_join_wd_permutations)

print("\n+++++++++++\n")


print("Complete list of multi-word expressions:")
print(mwe_tokenizer._mwes)

print("\n+++++++++++\n")
Example #23
import math
import glob
import pickle
import nltk
from nltk.tokenize import MWETokenizer

# restore model from collocation.nb
model_1 = pickle.load(
    open('/home/hrrathod/project/collocations/collocation.nb', 'rb'))
# restore model from train_collocation.nb
model_2 = pickle.load(
    open('/home/hrrathod/project/collocations/train_collocation.nb', 'rb'))

# Create an empty list for the multi-word expression tokens
tokenizer = MWETokenizer([])
# Converting the collocations into a multi-word expression token
for w1, w2 in model_1:
    tokenizer.add_mwe((w1, w2))

# REPLACE PATH
# Get list of files in the test set
test_files = sorted(glob.glob('/home/hrrathod/project/test/*.txt'))

# Number of tokens in the positive reviews
pos_total_tokens = model_2['pos_fd'].N()
# Number of tokens in the neutral reviews
neu_total_tokens = model_2['neu_fd'].N()
# Number of tokens in the negative reviews
neg_total_tokens = model_2['neg_fd'].N()

# Combining all FDs
fd = model_2['pos_fd'] + model_2['neu_fd'] + model_2['neg_fd']
Example #24
import csv
import pickle
import wikipedia
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import MWETokenizer, word_tokenize
from nltk import pos_tag
from nltk import RegexpParser
from textblob import Word

train = []

reader = csv.reader(open('Topic_set_train.csv', 'r'))
tokenizer = MWETokenizer()

for row in reader:
    print("Data : " + str(row))
    title, category = row
    tokenizer.add_mwe(title.split())
    wiki_page = wikipedia.page(title)
    wiki_content = str.lower(wiki_page.summary)
    tokens = tokenizer.tokenize(wiki_content.split())
    tokens_content = " ".join(tokens)
    word_tokens = word_tokenize(tokens_content)
    bigger_words = [
        k for k in word_tokens if len(k) >= 3 and not k.startswith('===')
    ]
    stop = set(stopwords.words('english'))
    stopwords_cleaned_list = [k for k in bigger_words if k not in stop]
    lemmatized_tokens = []

    for word in stopwords_cleaned_list:
        w = Word(word)
        lemmatized_tokens.append(w.lemmatize())
Example #25
        )
        print("\n")

    # Read the words of interest
    words = open("emotion_words.txt").read().lower().split("\n")
    sentiment_bag = set()

    # Get the multi-word expression tokenizer and add each to the sentiment_bag
    mwe = set(filter(lambda a: " " in a, words))
    print("Multi-word expressions in emotion words: {}".format(",".join(mwe)))

    # Create the MWE tokenizer
    mwe_tokenizer = MWETokenizer()
    for s in mwe:
        print("Add mwe ", s)
        mwe_tokenizer.add_mwe(s.split(" "))
        sentiment_bag.add(s.replace(" ", "_"))

    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    ps = PorterStemmer()
    print("Stemming:")
    for word in filter(lambda a: " " not in a, words):
        print("{} => {} / {} / {}".format(word, lmtzr.lemmatize(word), st.stem(word), ps.stem(word)))
        sentiment_bag.add(word)
        sentiment_bag.add(st.stem(word))  # I like this one the best

    # Process all the lists
    for (label, files) in sorted(makecloud.TRANSCRIPTS.items()):
        scores = []
        print("{}:\n{}=".format(label, "=" * len(label)))
Example #26
def tokenize_with_mwe(text):
    mwe_tokenizer = MWETokenizer([('Republic', 'Day')])
    mwe_tokenizer.add_mwe(('Indian', 'Army'))
    return mwe_tokenizer.tokenize(text.split())
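A quick check of the expected behaviour (the default separator is an underscore):

print(tokenize_with_mwe("Indian Army parade on Republic Day"))
# ['Indian_Army', 'parade', 'on', 'Republic_Day']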
Example #27
def main():
    parser = argparse.ArgumentParser(
        description="group sentence by cooccurrence")
    parser.add_argument('--input_dir',
                        type=str,
                        default='',
                        help='autophrase parsed directory')
    parser.add_argument('--query_string',
                        type=str,
                        default='',
                        help='search query')
    parser.add_argument('--num_process',
                        type=int,
                        default=2,
                        help='number of parallel')

    args = parser.parse_args()
    query = args.query_string.split(',')
    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    print(query)
    sys.stdout.flush()

    ##### sentence search #####
    start_time = time.time()

    input_dir = os.listdir(args.input_dir)
    tasks = list(split(input_dir, args.num_process))

    inputs = [(tasks[i], args) for i in range(args.num_process)]

    with Pool(args.num_process) as p:
        search_results = p.map(sent_search, inputs)

    search_merge = search_results[0]['context']
    count_merge = search_results[0]['freq']

    for pid in range(1, len(search_results)):
        tmp_context = search_results[pid]['context']
        tmp_freq = search_results[pid]['freq']
        for ent in query:
            search_merge[ent] += tmp_context[ent]
            count_merge[ent]['total'] += tmp_freq[ent]['total']
            tmp_freq[ent].pop('total', None)
            count_merge[ent].update(tmp_freq[ent])

    for ent in query:
        for index in range(len(search_merge[ent])):
            search_merge[ent][index]['doc_score'] = count_merge[ent][
                search_merge[ent][index]['did']] / count_merge[ent]['total']

    fid = 1
    for ent in query:
        with open('retrieved-{}.txt'.format(fid), "w+") as f:
            for sent in search_merge[ent]:
                f.write(json.dumps(sent) + '\n')
        f.close()
        fid += 1

    print("--- search use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    unigrams = []
    for ent in query:
        for sent in search_merge[ent]:
            unigrams += sent['unigram']
    unigram_set = set(unigrams)

    for ent in query:
        unigram_set.discard(ent)

    unigram_sents = {}
    for ent in query:
        unigram_sents.update({ent: {}})
        for sent in search_merge[ent]:
            unigram = set(sent['unigram'])
            unigram_intersect = unigram.intersection(unigram_set)
            for item in unigram_intersect:
                if item in unigram_sents[ent].keys():
                    unigram_sents[ent][item].append(sent)
                else:
                    unigram_sents[ent].update({item: [sent]})

    score_dist = {}
    for ug in unigram_set:
        score_dist.update({ug: {}})
        for ent in query:
            score_dist[ug].update({ent: 0})
            if ug in unigram_sents[ent].keys():
                did = set()
                for sent in unigram_sents[ent][ug]:
                    score_dist[ug][ent] += sent['doc_score']
                    #if sent['did'] not in did:
                    #score_dist[ug][ent] += sent['doc_score']
                    did.add(sent['did'])

    agg_score = {}
    for ug in score_dist.keys():
        tmp_res = [item[1] for item in score_dist[ug].items()]
        agg_score.update({ug: np.mean(tmp_res) - np.std(tmp_res)})

    score_sorted = sorted(agg_score.items(), key=lambda x: x[1], reverse=True)

    print("--- unigram score %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    ### phrase hard match ###

    start_time = time.time()
    mined_phrases = {}
    for ent in query:
        mined_phrases.update({ent: []})
        for sent in search_merge[ent]:
            mined_phrases[ent] += sent['phrases']

    coo_phrases = set(mined_phrases[query[0]])
    for ent in query:
        coo_phrases = coo_phrases.intersection(set(mined_phrases[ent]))

    print(coo_phrases)
    print(mined_phrases)

    tokenizer = MWETokenizer(separator=' ')

    for ph in coo_phrases:
        tokenizer.add_mwe(nltk.word_tokenize(ph))

    search_refetch = {}
    for ent in query:
        search_refetch.update({ent: []})
        for sent in search_merge[ent]:
            sent_tok = nltk.word_tokenize(sent['text'])
            raw_tokenized = tokenizer.tokenize(sent_tok)
            tokenized_set = set(raw_tokenized)
            mentioned_phrase = list(tokenized_set.intersection(coo_phrases))
            if len(mentioned_phrase) != 0:
                sent['phrases'] = mentioned_phrase
                search_refetch[ent].append(sent)

    mined_phrases = {}
    fid = 0
    for ent in query:
        ent_phrase = []
        phrase_cnt = Counter()
        mined_phrases.update({ent: []})
        for sent in search_refetch[ent]:
            mined_phrases[ent] += sent['phrases']
            phrase_cnt.update(sent['phrases'])

        for query_ent in query:
            phrase_cnt.pop(query_ent, None)
        with open('phrase-mined-{}.txt'.format(fid), "w+") as f:
            for pair in sorted(phrase_cnt.items(), key=lambda kv: kv[1]):
                f.write('{} {} \n'.format(pair[0], pair[1]))
        f.close()
        fid += 1

    coo_phrases = set(mined_phrases[query[0]])
    for ent in query:
        coo_phrases = coo_phrases.intersection(set(mined_phrases[ent]))

    print('number of cooccurred phrase: ', len(coo_phrases))
    print("--- phrase eval use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    start_time = time.time()
    phrase_sents = {}
    for ent in query:
        phrase_sents.update({ent: {}})
        for sent in search_refetch[ent]:
            interc = set(sent['phrases']).intersection(coo_phrases)
            for item in interc:
                if item in phrase_sents[ent].keys():
                    phrase_sents[ent][item].append(sent)
                else:
                    phrase_sents[ent].update({item: [sent]})

    fid = 1
    for ent in query:
        with open('phrase-sent-dist-{}.txt'.format(fid), "w+") as f:
            for key, value in phrase_sents[ent].items():
                meta = {key: len(value)}
                f.write(json.dumps(meta) + '\n')
        f.close()
        fid += 1
Example #28
class LyricsCleaner:
    """cleans and tokenizes the from a song lyrics in preparation for embedding"""

    def __init__(self, filename):
        """initializes a LyricsCleaner object"""
        self._filename = filename
        self._tokenizer = MWETokenizer()
        for word in SIGNAL_WORDS:
            self._tokenizer.add_mwe(('[', word, ']'))
        self._stemmer = LancasterStemmer()

    def tokenizeSong(self):
        """breaks up the lyrics into tokens using the nltk tokenizer, stemming,
        and various normalization techniques"""
        with open(NAMES_CSV) as nameFile:
            read = csv.reader(nameFile, delimiter=",")
            names = []
            for name in read:
                names.append(name[0])
        with open(self._filename) as songJSON:
            rawData = json.load(songJSON)
            #get the lyrics from the json file
            lyrics = rawData["songs"][0]["lyrics"]
            if lyrics is not None:
                #preserve the newline for prediction, want to predict the newline character
                preserveNewline = lyrics.replace("\n", " **NEWLINE** ")
                #tokenize the lyrics
                tokenizedLyrics = nltk.word_tokenize(preserveNewline)
                #replace people's names with general name token
                for k in range(len(tokenizedLyrics)):
                    if tokenizedLyrics[k] in names:
                        tokenizedLyrics[k] = "**NAME_VAR**"
                    #NOT DOING THIS ANYMORE: take out words that are not english
                    #else:
                    #    for h in range(len(tokenizedLyrics[k])):
                    #        if not tokenizedLyrics[k][h] in string.printable and len(tokenizedLyrics[k]) > 1 and tokenizedLyrics[k][h] != "…":
                    #            if h != len(tokenizedLyrics[k]):
                    #                print(tokenizedLyrics[k] + " ==> " + tokenizedLyrics[k][h])
                    #                tokenizedLyrics[k] = "**NOT_ENLGISH**"
                #bring the multi-word expressions back together ([CHORUS], [VERSE], etc.)
                tokenizedLyrics = self._tokenizer.tokenize(tokenizedLyrics)
                #add start token
                newLyrics = ['START']

                #normalize the labels for the parts of the song
                i = 0
                while i < len(tokenizedLyrics):
                    word = tokenizedLyrics[i]
                    if word == "[":
                        if tokenizedLyrics[i + 1] in SIGNAL_WORDS:
                            j = i + 2
                            while tokenizedLyrics[j] != "]" and j < len(tokenizedLyrics) - 1:
                                j += 1
                            word = word + "_" + tokenizedLyrics[i+1] + "_" + tokenizedLyrics[j]
                            newLyrics += [word.lower()]
                            i = j

                    #if word is not a stopword, keep it
                    elif word not in nltk.corpus.stopwords.words("english"):
                        if word[2:len(word)-2] not in SIGNAL_WORDS:
                            #make everything lowercase because capitalization doesn't really matter in songs?
                            #add the stem
                            newLyrics += [self._stemmer.stem(word.lower())]
                            if word.lower() != self._stemmer.stem(word.lower()):
                                #if stem is same as original word except for last letter in original
                                if word.lower()[:len(word)-1] == self._stemmer.stem(word.lower()):
                                    #add the last letter in the original word
                                    newLyrics += word.lower()[len(word)-1:]
                                #if stem is same as original word except for last two letters in original
                                elif word.lower()[:len(word)-2] == self._stemmer.stem(word.lower()):
                                    #add the last two letters in the original word
                                    newLyrics += word.lower()[len(word)-2:]
                                #if stem is same as original word except for last three letters in original
                                elif word.lower()[:len(word)-3] == self._stemmer.stem(word.lower()):
                                    #add the last three letters in the original word
                                    newLyrics += word.lower()[len(word)-3:]
                                #if stem is like once or since the stem is "ont" or "sint"
                                elif word.lower()[len(word)-3:len(word)-1] == "ce" and self._stemmer.stem(word.lower())[len(self._stemmer.stem(word.lower()))-1] == "t":
                                    #add the "ce" as a token
                                    newLyrics += word.lower()[len(word)-3:len(word)-1]
                    i += 1
                                    #print(word.lower()[len(word)-3:len(word)-1])
                #add end token to the end of a song
                newLyrics += ['END']
                return newLyrics
Example #29
from nltk.tokenize import TweetTokenizer, MWETokenizer
import nltk.corpus as corpus

# Twitter Tokenizer
tweettk = TweetTokenizer()
# Multi-Word Tokenizer
mwetk = MWETokenizer()
# Load movie titles into Multi-Word Tokenizer
movie_titles = []
with open('movies.json', 'r') as file:
    movie_titles = [movie['name'].split() for movie in json.load(file)]
for title in movie_titles:
    # Movies that are one word don't need to be included
    if len(title) < 2:
        continue
    mwetk.add_mwe(tuple(title))
    # Include lowercased version as well
    mwetk.add_mwe(tuple([s.lower() for s in title]))

stop_words = set(corpus.stopwords.words('english'))
printable = set(string.printable)


def tokenizeTweets(tweets):
    for tweet in tweets:
        # Remove any non-ASCII chars
        text = ''.join([x for x in tweet['text'] if x in printable])
        text = tweettk.tokenize(text)
        text = mwetk.tokenize(text)
        text = [token for token in text if token not in stop_words]
        tweet['original_text'] = tweet['text']
# 4. Whitespace Tokenizer
# 5. Word Punct Tokenizer

# 1. Tweet tokenizer
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence5)

# 2. MWE Tokenizer (Multi-Word Expression)
from nltk.tokenize import MWETokenizer

mwe_tokenizer = MWETokenizer([
    ('Republic', 'Day')
])  # Declaring set of words that are to be treated as one entity
mwe_tokenizer.add_mwe(('Indian', 'Army'))  # Adding more words to the set

mwe_tokenizer.tokenize(
    sentence5.split()
)  # 'Indian Army' should become one token, but "Army!" still carries the '!' so it is not merged

mwe_tokenizer.tokenize(sentence5.replace(
    '!', '').split())  # with the '!' removed, 'Indian Army' is merged into 'Indian_Army'

# 3. Regexp Tokenizer
from nltk.tokenize import RegexpTokenizer

reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence5)

# 4. Whitespace Tokenizer
Example #31
print(phrases)
print("=" * 20)

#tokenize words
words = word_tokenize(text)
print(words)
print("=" * 20)

#pos tagging words
word_tags = nltk.pos_tag(words)
print(word_tags)
print("=" * 20)

#select specified words (proper noun here)
for word_tag in word_tags:
    if word_tag[1] == 'NNP':
        print(word_tag[0])

# import MWETokenizer() method from nltk
from nltk.tokenize import MWETokenizer

mwe = MWETokenizer()

# Add multi-word expressions to the tokenizer
mwe.add_mwe(('All', 'work', 'and'))
mwe.add_mwe(('New', 'York'))

# tokenize with mwe
mwe_words = mwe.tokenize(words)
print(mwe_words)
Example #32
from spacy.en import English
nlp = English()


# This is for multi-word-phrases. 
MWE = [] 
path = "/".join(os.path.realpath(__file__).split("/")[:-2]) + '/input/'
print 'path', path
with open(path+'STREUSLE2.1-mwes.tsv') as f:
    for line in f.readlines():
        multiword_expression = line.split('\t')[0].split()[1:]
        MWE.append(multiword_expression)
MWE_tokenizer = MWETokenizer(MWE, separator='-')
# Add whatever additional custom multi-word-expressions.
MWE_tokenizer.add_mwe((  'dive', 'bar'))
# Stopwords
stops = set(stopwords.words("english") + stopwords.words("spanish"))
keep_list = ['after', 'during', 'not', 'between', 'other', 'over', 'under', 
             'most', ' without', 'nor', 'no', 'very', 'against','don','aren']
stops = set([word for word in stops if word not in keep_list])


table = string.maketrans("","")



sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
whitespace_tokenizer = WhitespaceTokenizer()