def viterbi_segment(longstring):
    """
    INPUT:
        text: str, text that need to be segmented
        corpusPath: path of corpus file used to generate a voucabulary
    OUTPUT:
        segmented token: str,  original longstring segmented

    EXAMPLE:
    >>>viterbi_segment('myhomeawayfromhom')
    'my home away from ho m'
    """
    #longstring = re.sub(r'-', '',longstring)
    # remove periods, digits, underscores, and hyphens before segmenting
    longstring = re.sub(r'\.|[0-9]|_|-', '', longstring)
    # probs[i]: probability of the best segmentation of longstring[:i]
    # lasts[i]: start index of the last word in that segmentation
    probs, lasts = [1.0], [0]
    for i in range(1, len(longstring) + 1):
        # word_prob and max_word_length are defined elsewhere in the module
        prob_k, k = max((probs[j] * word_prob(longstring[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(longstring)
    while 0 < i:
        words.append(longstring[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    segmented_token = ' '.join(words)
    return segmented_token
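# viterbi_segment depends on word_prob() and max_word_length, which are defined
# elsewhere in the original module. A minimal sketch of what they might look
# like, assuming a simple unigram model (counts/total below are illustrative,
# not from the source):
from collections import Counter

counts = Counter(["my", "home", "away", "from"])
total = sum(counts.values())
max_word_length = max(len(w) for w in counts)

def word_prob(word):
    # unseen substrings get a tiny non-zero probability
    return counts[word] / total if word in counts else 1e-10

print(viterbi_segment('myhomeawayfrom'))   # 'my home away from'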
Example #2
def splitTrim(line):
  words = []
  temp = re.findall(r'\w+', line)
  for i in temp:
    if len(i)>2:
      words.append(i)
  #print(words)
  return words
def get_topic_words(ldamodel, num_topics=num_topics):
    words = []
    for i in range(0, num_topics):
        topics = ldamodel.show_topic(i)
        for topic in topics:
            words.append(topic[0])

    return words
Example #4
def get_words_from_keys(keys):
    '''Helper function for filtering stopwords and non-english words from a list of keys.
    Returns a list of strings.'''
    words = []
    for key in keys:
        if key not in nltk_stopwords and key in nltk_words:
            words.append(key)
    return words
Example #5
def list_maker(data):
    """
    Make a list from read-in file lines. Return a list
    """
    words = []
    # skip the first 35 lines of the input; keep the rest with trailing newlines stripped
    for i in range(35, len(data)):
        words.append(data[i].rstrip('\n'))
    return words
Example #6
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words
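# Usage sketch (not from the source): segs is a binary string one character
# shorter than text, where '1' marks a word boundary after that position.
print(segment("doyousee", "0100100"))   # ['do', 'you', 'see']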
Example #7
    def _get_all_words_from_node(self, node, curr_word, words):
        # A word is emitted only when recursion reaches a falsy node, i.e. this
        # appears to assume terminal entries in node.children map to None
        # (end-of-word markers).
        if not node:
            words.append(''.join(curr_word))
            return

        curr_word.append(node.value)
        for child in node.children.values():
            self._get_all_words_from_node(child, curr_word, words)
        curr_word.pop()
Example #8
def WordTokenize(namafile):
    # this function returns an array of words from the document named namafile
    words = []
    with open(namafile, 'r') as f:
        for line in f:
            for word in line.split():
                if (word != '' and word != ' '):
                    words.append(clean(word))
    return words
Example #9
def brute(string, length, charset):
    global words
    if len(string) == length:
        return
    for char in charset:
        temp = string + char
        words.append(temp)
        brute(temp, length, charset)
    return (words)
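# Usage sketch (not from the source): `words` is a module-level list that
# brute() both appends to and returns, so it has to exist before the call.
words = []
print(brute('', 2, 'ab'))   # ['a', 'aa', 'ab', 'b', 'ba', 'bb']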
Example #10
def STokenWord(namafile):  # namafile: string.txt
    # return unique words that have been stemmed, with stopwords removed
    words = []
    with open(namafile, 'r') as f:
        for i in f:
            kata = word_tokenize(i)
            for j in kata:
                if (not (ps.stem(j) in words) and not (j in stop_words)):
                    words.append(ps.stem(j))
    return words
Example #11
def split_word(content):
    # character class fixed: the original r"'?([_-a-zA-z0-9']+)'?" mixed up its ranges (e.g. "A-z")
    REG_EXPR = r"'?([-_a-zA-Z0-9']+)'?"

    pattern = re.compile(REG_EXPR)
    matches = pattern.finditer(content)
    words = []

    for match in matches:
        words.append(match.group(0))

    return words
Example #12
def STokenWord(namafile):  # namafile: string.txt
    # return unique words that have been stemmed, with stopwords removed
    words = []
    with open(namafile, 'r') as f:
        for line in f:
            for word in line.split():
                if (clean(word) not in words) and (word not in stop_words):
                    words.append(clean(word))
    if ('' in words):
        words.remove('')
    if (' ' in words):
        words.remove(' ')
    return words
Example #13
def words_and_not_words(xs, ws):
    flwD = {}
    for w in ws:
        flwD[w] = w

    words, not_words = [], []
    for x in xs:
        try:
            words.append(flwD[x])
        except KeyError:
            not_words.append(x)

    return words, not_words
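# Usage sketch (not from the source): partitions xs into items that appear in
# ws and items that do not.
print(words_and_not_words(['a', 'b', 'c'], ['a', 'c']))   # (['a', 'c'], ['b'])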
Example #14
    def get_features_from_chunk(self, chunks_tree):
        if chunks_tree.height() > 3:
            raise Exception("Chunk tree is too deep to parse!")
        words = []
        labels = []
        for chunk in chunks_tree:
            if isinstance(chunk, nltk.Tree):
                for b in chunk:
                    words.append(b[0])
                    labels.append('BIO')
            else:
                words.append(chunk[0])
                labels.append('O')

        feats = self.fast_calculate_features(words)
        return zip(feats, labels)
Example #15
def topTen(classDict, classCount):
  # Return the ten words with the highest P(class|word) for this class;
  # allCount and allProb are module-level globals.
  words = []
  probs = []
  #keys = word, vals = P(word|class)
  keys = list(classDict.keys())
  vals = list(classDict.values())
  pClass = classCount/allCount
  for i in range(len(keys)):
    pWord = allProb[(keys[i])]
    # Bayes' rule: P(class|word) = P(word|class) * P(class) / P(word)
    p = (vals[i]*pClass)/pWord
    words.append(keys[i])
    probs.append(p)
  # indices of the ten largest probabilities, in ascending order
  top = sorted(range(len(probs)), key=lambda i: probs[i])[-10:]
  topWords = []
  for i in top:
    topWords.append(words[i])
  return topWords
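# Usage sketch (not from the source): allProb maps word -> P(word) over the
# whole corpus and allCount is the total number of training examples; both are
# globals the function relies on, and the values below are illustrative only.
allCount = 10
allProb = {'spam': 0.5, 'ham': 0.5}
print(topTen({'spam': 0.9, 'ham': 0.1}, 4))   # ['ham', 'spam'], least to most probable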
Example #16
def text_cleaning(docs):
    documents = {}
    documents_without_verbs = {}

    for doc in docs:
        words = []
        for word in docs[doc]:
            word = word.lower()
            # strip punctuation and symbol characters (the original chained
            # str.replace calls without reassigning `word`, so they had no effect)
            for ch in ('|', '\\', '!', '"', '£', '$', '%', '&', '(', ')',
                       '=', '?', '^', ',', '.', '@', '#', "'", '~'):
                word = word.replace(ch, '')
            if (word not in stopwords.words('english')) and (
                    word not in punctuation_list) and (len(word) > 1) and (
                        not word.isdigit()) and ("//" not in word):
                # TODO LEMMATIZATION
                words.append(word)
        documents[doc] = words

        # bigram_fd = nltk.FreqDist(nltk.bigrams(words))
        # words is the list of words to modify
        # document_tagged = nltk.pos_tag(words)
        # Take nouns and adjectives, drop the verbs, and run TF-IDF on those
        # document_tagged = [(x,y) for (x,y) in document_tagged if (y in ('VB', 'NN', 'NNS', 'NNP', 'NNPS','VBD', "VBG", "VBN", "VBP", "VBZ"))or ("_" in x) ]

        # print(document_tagged)

    # note: documents_without_verbs is never filled in before being dumped
    f = open("./data_structures/for_test_noverbs.json", "w+")
    json_data = json.dumps(documents_without_verbs)
    f.write(json_data)
    f.close()

    f = open("./data_structures/for_test.json", "w+")
    json_data = json.dumps(documents)
    f.write(json_data)
    f.close()

    return documents
def callViterbi(text):
    # Viterbi word segmentation; `dictionary` (presumably a Counter of word
    # frequencies), `total`, and `maximumlength` are module-level globals.
    scoreList, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k = 0
        k = 0
        for j in range(max(0, i - maximumlength), i):
            candidate = scoreList[j] * (dictionary[text[j:i]] / total)
            if candidate > prob_k:
                prob_k = candidate
                k = j
        scoreList.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    # walk the back-pointers to recover the segmentation
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words
Example #18
def load_event_ontology(file):
    input_file = open(file, "r").readlines()
    phase_category2keywords = {}
    expression2words = {}
    # first pass: collect "@expression phrase1, phrase2, ..." definitions
    for line in input_file:
        if not line.strip():
            continue
        if line[0] == "@":
            words = []
            expression = line.split()[0]
            phrases = " ".join(line.split()[1:]).split(",")
            for phrase in phrases:
                words.append(" ".join(phrase.split()).lower())
            expression2words[expression] = words
    # second pass: "#" starts a phase, "##" a category, and "###" a comma-
    # separated list of keyword phrases; bare expression names are expanded
    for line in input_file:
        if not line.strip():
            continue
        words = line.split()
        if words[0] == "#":
            phase = words[1]
            phase_category2keywords[phase] = {}
            continue
        if words[0] == "##":
            category = "_".join(words[1:])
            phase_category2keywords[phase][category] = []
            continue
        if words[0] == "###":
            phrases = line.replace("###", "").split(",")
            for phrase in phrases:
                expression_flag = False
                for word in phrase.split():
                    if "@" + word in expression2words:
                        expression_flag = True
                        for expression_word in expression2words["@" + word]:
                            final_phrase = " ".join(phrase.split()).replace(
                                word, expression_word)
                            phase_category2keywords[phase][category].append(
                                final_phrase.lower())
                if expression_flag == False:
                    phase_category2keywords[phase][category].append(" ".join(
                        phrase.split()).lower())
            continue
    return phase_category2keywords
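# Usage sketch (the file format below is inferred from the parser above, not
# documented in the source): "@name ..." defines an expression, "#" starts a
# phase, "##" a category, and "###" a comma-separated list of keyword phrases
# in which a bare expression name expands into every word it maps to.
sample = """@color red, dark red, blue

# Preparation
## Site_Setup
### install color fence, clear the area
"""
with open("sample_ontology.txt", "w") as f:   # hypothetical file name
    f.write(sample)

print(load_event_ontology("sample_ontology.txt"))
# {'Preparation': {'Site_Setup': ['install red fence', 'install dark red fence',
#                                 'install blue fence', 'clear the area']}}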
Example #19
def separator(chars, exclude=None):
    # Greedy longest-match segmentation with backtracking: check() (defined
    # elsewhere) decides whether a segment is a word; on a dead end the last
    # accepted word is excluded and the whole string is re-segmented.
    words = []
    if not chars.isalpha():
        return [chars]
    if not exclude:
        exclude = set()
    working_chars = chars
    while working_chars:
        for i in range(len(working_chars), 1, -1):
            segment = working_chars[:i]
            if check(segment) and segment not in exclude:
                words.append(segment)
                working_chars = working_chars[i:]
                break
        else:
            if words:
                exclude.add(words[-1])
                return separator(chars, exclude=exclude)
            return [chars]
    return words
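# Minimal sketch of the check() predicate separator() relies on, assuming it is
# a dictionary-membership test (VOCAB here is illustrative, not from the source).
VOCAB = {"book", "store", "keeper"}

def check(segment):
    return segment.lower() in VOCAB

print(separator("bookstore"))   # ['book', 'store']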
def updateFilterWords():
    df = pd.read_excel("./Resource/filterCandidiate.xlsx",
                       index_col=None,
                       header=None)
    allWords = list(set(df[0]))
    words = []
    # filter out null / non-string entries (len() raises TypeError on NaN floats)
    for w in allWords:
        try:
            if len(w) > 1:
                words.append(w)
        except TypeError:
            continue
    # note: `words` is not used below; the filter list is built from df directly
    wordsNeed2Filter = [df[0][i] for i in range(len(df[0])) if df[1][i] == 1]
    wordsNeed2Filter += ["IEEE", "ON", "DOWN", "THIS", "THAT"]
    wordsNeed2Filter += [chr(i) for i in range(97, 123)]
    wordsNeed2Filter += [chr(i) for i in range(65, 91)]
    wordsNeed2Filter = set(wordsNeed2Filter)
    pickle.dump(wordsNeed2Filter, open("wordsNeedToBeFiltered.dat", "wb"))
Example #21
def text_files_to_wordbags():
    count = 0
    files = os.listdir(directory)
    files.sort()

    words = []
    titles = []

    for filename in files:
        if count < max_books:
            if '.txt' in filename:
                count += 1
                book_no = filename.split(' -- ')[0]
                # str.strip('.txt') strips characters, not a suffix; split the extension off instead
                book_title = filename.split(' -- ')[1].rsplit('.txt', 1)[0]
                titles.append(book_title)
                print('\n' + book_title)

                text = file_to_text(directory + '/' + filename)
                lemmas = text_to_lemmas(text)
                words.append(lemmas)
    return words, titles
Example #22
def prediction_data_preparation(test_df, le, thresh):
    '''
    ANALYSIS OF MODEL PREDICTION WITH CONSIDERATION OF PROBABILITY:
    SELECTS PREDICTED WORDS BASED ON A PROBABILITY THRESHOLD

    test_df = dataframe with test predictions and probabilities
    test_df is transformed back into words, and '_' is replaced with '' for better interpretability

    *the function ensures that the chosen words (probability sentences) contain the same number of words as the model prediction
    '''

    from tqdm import tqdm

    test_df_transformed = test_df.copy()

    transform_columns = [
        '5_ae', '5_a', '4_iaebglebg', '4_iabglbg', '3_mdagmlg', '2_kl',
        '3_mdagrmlg', '2_ka', 'REF'
    ]

    for columns in transform_columns:
        test_df_transformed[columns] = le.inverse_transform(
            test_df_transformed[columns])

    words = []
    test_df_transformed['selected_words'] = None

    test_df_transformed['REF'] = test_df_transformed['REF'].replace('_', '')

    for row in tqdm(range(test_df_transformed.shape[0])):
        if test_df_transformed['probability'][row] > thresh:
            words.append(test_df_transformed['REF'][row])
        else:
            words.append('_')

    test_df_transformed['selected_words'] = words
    test_df_transformed.loc[test_df_transformed['REF'] == '',
                            'selected_words'] = ''

    return test_df_transformed
Example #23
def lemmatization(docs):
    documents = {}
    for doc in docs:
        words = []
        document_tagged = nltk.pos_tag(docs[doc])
        # keep nouns, adjectives and verbs (plus tokens containing "_") and lemmatize them by POS
        document_tagged = [
            (x, y) for (x, y) in document_tagged
            if (y in ("JJ", 'JJR', 'JJS', 'VB', 'NN', 'NNS', 'NNP', 'NNPS',
                      'VBD', "VBG", "VBN", "VBP", "VBZ")) or ("_" in x)
        ]

        for word, tag in document_tagged:
            # elif chain so nouns and verbs are not also reported as "left out"
            if tag.startswith("NN"):
                lemmatized_word = lemmatizer.lemmatize(word, pos="n")
                words.append(lemmatized_word)
            elif tag.startswith("VB"):
                lemmatized_word = lemmatizer.lemmatize(word, pos="v")
                words.append(lemmatized_word)
            elif tag.startswith("JJ"):
                lemmatized_word = lemmatizer.lemmatize(word, pos="a")
                words.append(lemmatized_word)
            else:
                print("left out: ", word)
        documents[doc] = words

    f = open("./data_structures/for_test_lemmas.json", "w+")
    json_data = json.dumps(documents)
    f.write(json_data)
    f.close()
    print(documents)
Example #24
def generate_keywords(messages, n=8):
    sentences = read_messages(messages)
    stop_words = stopwords.words('english')
    words = []
    common_word_list = create_common_word_list(messages)
    for sentence in sentences:
        sentence_words = word_tokenize(sentence)
        for i in range(len(sentence_words)):
            sentence_words[i] = sentence_words[i].lower()
        words.append(sentence_words)
    flat_words = [item for sublist in words for item in sublist]
    final_wordlist = []
    for flat_word in flat_words:
        if flat_word not in stop_words and len(flat_word) > 3:
            final_wordlist.append(flat_word)
    freq = Counter(final_wordlist)
    common_keywords = (freq.most_common(n))
    final_words = []
    for c in common_keywords:
        if c[0] not in common_word_list:
            final_words.append(c[0])
    return final_words
Example #25
def attempt_words(word):
    '''Return a list of words that may or may not be real'''
    # If 0 stands for 'use the letter before' and 1 for 'use the letter
    # after', then the numbers from 0 to 2**n - 1 (where n is the length
    # of the string) represent all possible translations, **if** each
    # number is zero-padded to n places (e.g. "000" instead of "0" for n=3).
    n = len(word)
    options = [bin(i)[2:].zfill(n) for i in range(2**n)]
    words = []
    for i, option in enumerate(options):
        new_word = ''
        for j, c in enumerate(word):
            choice = option[j]
            k = ABC.find(c)
            if choice == '0':
                new_c = ABC[k - 1]
            else:
                # Wrap around if go past right end of list
                new_c = ABC[(k + 1) % len(ABC)]
            new_word += new_c
        words.append(new_word)
    return words
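# Usage sketch (not from the source): ABC is assumed to be the lowercase
# alphabet; every letter of the input is shifted one step back or forward.
ABC = 'abcdefghijklmnopqrstuvwxyz'
print('ibm' in attempt_words('hal'))   # True -- the '111' option shifts each letter forward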
Example #26
def findpattern(Apps, Revs):

    App = []
    for i in Revs:
        App.append(i[3])
    App = list(set(App))
    #print "Len App",len(App)
    dic = {}
    for i, j in enumerate(App):
        dic[j] = i
    NumApp = len(App)
    for i in range(0, len(Revs)):
        for j in range(0, len(Revs[i])):
            if Revs[i][j] == "NULL":
                Revs[i][j] = ""

    # <codecell>

    result1 = codecs.open("dis1.txt", "w", "utf-8")
    pairs1 = np.zeros((NumApp, NumApp), dtype=int)  # np.int was removed from NumPy; plain int works
    resultApp = codecs.open("output.txt", "w", "utf-8")
    ####################################################
    nlp = spacy.en.English()  # spaCy 1.x API
    count = 0
    '''with codecs.open("sample.csv","r","latin-1") as f:
		arr = [i.rsplit("\n")[0] for i in f]
		arr = [i.rsplit("\t") for i in arr]'''
    c = 0
    for rev in Revs:
        #print count
        count = count + 1
        title = rev[0]
        body = rev[1]
        r = title + u"," + body
        #r = title + u" , " +body
        AppID = rev[3]
        words = []
        #print r,type(r)
        doc = nlp(r)
        for chunk in doc.noun_chunks:
            #print "NNP" , chunk.orth_
            words.append(chunk.orth_)
        for token in doc:
            if token.tag_ == "NN" or token.tag_ == "NNS" or token.tag_ == "NNP" or token.tag_ == "NNPS":
                #print "NN",token
                words.append(token.orth_)

        for i, j in enumerate(words):
            j = j.lower()
            if j.endswith(' app'):
                words.append(j[:-4])
            if j.endswith(' game'):
                words.append(j[:-5])
            if j.startswith('the '):
                words.append(j[4:])

        found1 = False
        found2 = False
        Ans = "NULL"

        for i in words:

            for j in Apps:
                #if j[1] != AppID:
                #dist = distance(j[0],i.lower())
                if AppID.find(i) != -1:
                    continue
                try:
                    if j[0] == i.lower() and not found1:
                        result1.write(i + "\t" + title + "\t" + body + "\t" +
                                      j[1] + "\t" + AppID + "\n")
                        # writing as [Source][Target] == from source to target
                        pairs1[dic[j[1]],
                               dic[AppID]] = pairs1[dic[j[1]], dic[AppID]] + 1
                        found1 = True
                        Ans = j[1]
                        #print "Found",j[1],j[0],i
                        c = c + 1

                except Exception as e:
                    #print "\error","  ",str(e)
                    #print AppID,"    ",j[1]
                    p = 0
                if found1:
                    break
            if found1:
                break
        resultApp.write(rev[0] + "\t" + rev[1] + "\t" + rev[2] + "\t" +
                        rev[3] + "\t" + rev[4] + "\t" + Ans + "\n")
    print('total found', c)
    resultApp.close()
Example #27
    def _replace(self, sentence, is_spell_check=True):
        words = []
        for word in word_tokenize(sentence):
            word = word.strip()
            if "/" in word or "\\" in word:
                words.append("__isslashinword__")
            elif self.word_re.match(word):
                if is_spell_check and word not in self.all_words:
                    words.append(self.spell_checker.correction(word))
                else:
                    words.append(word)
            elif self.number_re.match(word):
                words.append("__isnumber__")
            elif "__isurl__" in word:
                words.append("__isurl__")
            else:
                words.append("__isinvalidword__")
        return words

#PRINTING FINAL WORD LIST
#print(new)


#CREATING THE CLUSTER
# Assumes `new` and `word_list` were built earlier in the script.
import numpy as np
import sklearn.cluster
import distance       # pip package `distance` (Levenshtein distance)
import Levenshtein    # pip package `python-Levenshtein`

words_orgn = word_list
final_word_list=[]
for i in range(0,len(new)):
    if new[i] in word_list:
        final_word_list.append(new[i])

words = final_word_list
for i in range(0,len(new)):
    words.append(new[i])
words = np.asarray(words)
lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.80)
affprop.fit(lev_similarity)


data=[]
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    for i in range(0, len(cluster)):
        if cluster[i] in words_orgn:
            for j in range(0, len(new)):
                if Levenshtein.ratio(cluster[i], new[j]) > 0.5:
                    print(cluster[i])
    def load_search_words(self):
        words = []
        with open(self.search_word_file_path, 'r') as f:
            for line in f:
                words.append(line.strip())
        return words
Example #30
for word in vocab:
    # vocab, words, phenomes, conversion, word_corpus_nltk and the grapheme-to-
    # phoneme converter `fl` are assumed to be defined earlier in the script
    # print(word)
    """
	new_word=''
	if len(word)>2:
		for j in range(0,len(word)-2):
			if word[j]==word[j+1] and word[j]==word[j+2]:
				continue
			else: 
				new_word=new_word+word[j]
	word=new_word
	"""
    word = word.lower()
    p = fl.english_g2p(word)
    if p not in phenomes:
        words.append(word)
        phenomes.append(p)
        conversion[word.lower()] = word.lower()
    else:
        i = phenomes.index(p)
        if words[i].lower() not in word_corpus_nltk:
            temp = words[i]
            words[i] = word
            conversion[word.lower()] = word.lower()
            conversion[temp.lower()] = word.lower()
        else:
            conversion[word.lower()] = words[i].lower()

f = open('Final_CB_DS/finalcb_dataset_cleaned.txt', 'r')
g = open('Final_CB_DS/sample_phonetic_ipa_vocab.txt', 'w')