def predict_answers(data, word2vec, N):

    stop = stopwords.words('english')

    pred_answs = []
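    # the first row of pred_probs acts as a header naming the four answer columns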
    pred_probs = [["A", "B", "C", "D"]]
    for i in range(data.shape[0]):
        #calculate word2vec for question
        q_vec = np.zeros(N, dtype=float)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                q_vec = np.add(q_vec, w2)
        q_vec = q_vec / linalg.norm(q_vec)
    
        #calculate word2vec for answers
        A_vec = np.zeros(N, dtype=float)
        B_vec = np.zeros(N, dtype=float)
        C_vec = np.zeros(N, dtype=float)
        D_vec = np.zeros(N, dtype=float)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec  and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                A_vec = np.add(A_vec,w2)
    
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                B_vec = np.add(B_vec,w2)
            
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                C_vec = np.add(C_vec,w2)

    
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                D_vec = np.add(D_vec,w2)
    
        A_vec = A_vec / linalg.norm(A_vec) 
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        
        #choose the answer with the highest cosine similarity to the question
        probs = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec)
        idx = probs.argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
        pred_probs.append(probs)
        
    return pred_answs, pred_probs
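A toy usage sketch for predict_answers, not taken from the original project: it assumes the surrounding module also provides tokenize, the NLTK stopwords, and a getword2vecval(N, word, word2vec) that is essentially a lookup of word in word2vec; the vocabulary, question and answers below are invented for illustration.

import numpy as np
import pandas as pd

N = 4
rng = np.random.default_rng(0)
word2vec = {w: rng.normal(size=N) for w in ['sun', 'moon', 'planet', 'rock', 'ice']}

toy = pd.DataFrame({
    'question': ['sun or moon or planet'],
    'answerA': ['sun'], 'answerB': ['moon'],
    'answerC': ['rock'], 'answerD': ['ice'],
})
answers, probs = predict_answers(toy, word2vec, N)
print(answers)    # a single predicted letter, e.g. ['A']
print(probs[1])   # cosine similarities of answers A..D to the question (probs[0] is the header row)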
def get_glove_features(data, word2vec, N):
    stop = stopwords.words('english')

    scores = []
    for i in range(data.shape[0]):
        #calculate word2vec for question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
                
#                 # get all synonyms of the word
#                 syns = wn.synsets(w.lower(), pos='n')
#                 if len(syns)>0:
#                     for syn in syns:
#                         sw = syn.lemma_names()[0]
#                         if sw.lower() in word2vec and sw.lower() not in stop:
#                             q_vec += word2vec[sw.lower()]
        
        q_vec = q_vec / linalg.norm(q_vec)
    
        #calculate word2vec for answers
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec  and w.lower() not in stop:
                A_vec += word2vec[w.lower()]
        
    
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]
        
            
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]
        
    
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]
                
    
        A_vec = A_vec / linalg.norm(A_vec) 
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
                
        scores.append(np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec))
        
    return scores
Example #3
def __build_dictionary(synset, hyperhypo):
    lesk_dictionary = []
    # Includes definition.
    lesk_dictionary += tokenize(synset.definition)
    # Includes lemma_names.
    lesk_dictionary += synset.lemma_names
    # Optional: includes lemma_names of hypernyms and hyponyms.
    if hyperhypo:
        related_senses = synset.hypernyms() + synset.hyponyms()
        for related_sense in related_senses:
            lesk_dictionary += tokenize(related_sense.definition)
            lesk_dictionary += [lemma.name for lemma in related_sense.lemmas]

    without_stop_words = filter(lambda word: word not in english_stopwords, lesk_dictionary)
    return map(lambda word: word.lower(), without_stop_words)
def generate_citations(lines, vocab, index):
    word2idx = dict([(v, k) for k, v in enumerate(vocab)])
    for line in lines[:100]:
        tokenized = list()
        capitalized = list()
        for word, cap in zip(utils.tokenize(line, periods=True), utils.tokenize(line, periods=True, capitalized=True)):
            if word == '.':
                if len(tokenized) > 10:
                    citation = generate_citation([word2idx[w] for w in tokenized if w in word2idx], index)
                    print(' '.join(capitalized) + ' (%s).' % citation)
                tokenized = list()
                capitalized = list()
            else:
                tokenized.append(word)
                capitalized.append(cap)
Example #5
def predict_segmented_tf_idf(data, docs_per_q, ids_and_categories):  
    #index docs
    res = []
    category_tf_idfs = {}
    for index, row in data.iterrows():
        current_id = str(row['id'])
        print current_id
        current_category = ids_and_categories[current_id]

        if category_tf_idfs.get(current_category) is None:
            category_tf_idfs[current_category] = utils.get_docstf_idf(wiki_docs_dir + '/%s' % current_category)

        docs_tf, words_idf = category_tf_idfs[current_category]

        #get answers words
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
    
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
    
        q = row['question']
        
        for d in zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w] # term frequency in the document times the word's inverse document frequency, log(num_docs / docs_containing_word)
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A','B','C','D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
        
    return res
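A minimal numeric sketch of the scoring rule above, under the assumption (suggested by the inline comment) that docs_tf maps each document to per-word term frequencies and words_idf maps each word to log(number_of_docs / docs_containing_word); all numbers are invented.

import math

docs_tf = {
    'doc1': {'photosynthesis': 3, 'energy': 1},
    'doc2': {'energy': 2, 'mitochondria': 4},
}
n_docs = 2
df = {'photosynthesis': 1, 'energy': 2, 'mitochondria': 1}
words_idf = {w: math.log(float(n_docs) / df[w]) for w in df}

answer_words = {'photosynthesis', 'energy'}
score = 0.0
for d in ('doc1',):                       # pretend doc1 was retrieved for the question
    for w in answer_words:
        if w in docs_tf[d]:
            score += docs_tf[d][w] * words_idf[w]
print(score)   # 3*log(2/1) + 1*log(2/2) = 3*log(2) ≈ 2.079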
    def matchUp(self, token, ingredientRow):
        """
        Returns our best guess of the match between the tags and the
        words from the display text.

        This problem is difficult for the following reasons:
            * not all the words in the display name have associated tags
            * the quantity field is stored as a number, but it appears
              as a string in the display name
            * the comment is often a compilation of different comments in
              the display name

        """
        ret = []

        # strip parens from the token, since they often appear in the
        # display_name, but are removed from the comment.
        token = utils.normalizeToken(token)
        decimalToken = self.parseNumbers(token)

        for key, val in ingredientRow.iteritems():
            if isinstance(val, basestring):

                for n, vt in enumerate(utils.tokenize(val)):
                    if utils.normalizeToken(vt) == token:
                        ret.append(key.upper())

            elif decimalToken is not None:
                try:
                    if val == decimalToken:
                        ret.append(key.upper())
                except:
                    pass

        return ret
Example #7
 def testTokens(self):
     tokens = utils.tokenize(self.str3)
     self.assertEqual(11, len(tokens))
     self.assertEqual('\n  two empty spaces and some escaped chars \\\"\\\' in normal textfollowed by a ', tokens[0]['token'])
     self.assertEqual('"dbl quote"', tokens[1]['token'])
     self.assertEqual(' and then a ', tokens[2]['token'])
     self.assertEqual("'single quote'", tokens[3]['token'])
     self.assertEqual('\nwait there is more!! ', tokens[4]['token'])
     self.assertEqual('"\'signle quotes\' inside a double quote"', tokens[5]['token'])
     self.assertEqual(' and ', tokens[6]['token'])
     self.assertEqual('\'"double quotes" inside a single quote\'', tokens[7]['token'])
     self.assertEqual('\nwait! there\\\'s more!! ', tokens[8]['token'])
     self.assertEqual('"escaped double quotes \\" and escaped single quotes\\\' "', tokens[9]['token'])
     self.assertEqual(' ', tokens[10]['token'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[0]['type'])
     self.assertEqual(utils.TOKEN_DBL_Q, tokens[1]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[2]['type'])
     self.assertEqual(utils.TOKEN_SNG_Q, tokens[3]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[4]['type'])
     self.assertEqual(utils.TOKEN_DBL_Q, tokens[5]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[6]['type'])
     self.assertEqual(utils.TOKEN_SNG_Q, tokens[7]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[8]['type'])
     self.assertEqual(utils.TOKEN_DBL_Q, tokens[9]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[10]['type'])
def FrequentWords(data_dirs, suffixes, max_key_words):
  """
  Returns a dictionary of min(max_key_words, percentile_key_words), giving key
  word with its count.
  """
  matches = matchingFiles(data_dirs, suffixes)

  token_count = Counter()
  files_done = 0
  for file_name in matches:
    tokens = tokenize(file_name)
    for token in tokens:
      if len(token) == 0:
        continue
      token_count[token] += 1  # Counter defaults missing keys to 0, so no try/except is needed
    files_done += 1
    if (files_done % 5000 == 0):
      print("Completed parsing %d files ..." % files_done)

#  num_key_words = min(max_key_words,
#                      math.ceil(percentile_key_words * len(token_count)))
  return token_count.most_common(max_key_words)
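A hypothetical call; the directory and suffixes are placeholders, and matchingFiles/tokenize are assumed to come from the surrounding module.

top_tokens = FrequentWords(['src/'], ['.py', '.cc'], max_key_words=1000)
for token, count in top_tokens[:10]:
    print(token, count)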
Example #9
    def tag(self, text=None):
        """
        Tags the given text.
        
        :param text: a string or unicode object. Strings assumed to be utf-8
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        result = []
        if text:
            tokens = utils.tokenize(text, clean=False)
            for sent in tokens:
                tags = self.tag_tokens(sent)
                result.append(zip(sent, tags))
        else:
            # read tsv from stdin
            sent = []
            for line in sys.stdin:
                line = line.decode('utf-8').strip()
                if line:
                    sent.append(line.split()[0])
                else:
                    tags = self.tag_tokens(sent)
                    result.append(zip(sent, tags))
                    sent = []

        return result
Example #10
    def bird_info(self):
        birdv = self.machine.run("echo | birdc | head -1").strip().replace(" ready.", "")
        birdv = birdv.split(" ")
        info = {
            "daemon":  birdv[0],
            "version": birdv[1],
            "ospf": {}
            }

        log.info("[%s] getting OSPF neighbours" % self.hostname())
        output = self.machine.run("echo show ospf neighbors | birdc | sed '/^bird[^ ] .*/d'")
        neighbours = []
        for toks in [tokenize(l) for l in splitlines(output)[2:]]:
            neighbour = {
                "routerid": toks[0]
                }
            if toks[4][0] in ascii_letters:
                neighbour["ifname"] =  toks[4]
                neighbour["v4addr"] =  toks[5]
            else:
                neighbour["v4addr"] =  toks[4]
                neighbour["ifname"] =  toks[5]
            neighbours.append(neighbour)
        info["ospf"]["neighbours"] = neighbours
        return info
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start: end]

        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del(row["input"])

                rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i+1, tokens)
                    print utils.joinLine([token] + features + [self.bestTag(tags)])

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print
 def classify_proba(self, text):
     token_list = tokenize(text)
     token_list = del_stopwords(token_list, self.stopset)
     wordfreq_dict = stat_wordfreq(token_list)
     dictfeats = tfidf(wordfreq_dict, self.idf_dict)
     vecfeats = self.vectorizer.transform(dictfeats).toarray()
     prob = self.classifier.predict_proba(vecfeats)
     return prob[0]
Example #13
 def macaddr(self, iface):
     output = self.machine.run("ip link show dev %s | grep link/ether" % iface).strip()
     if not output:
         return None
     mac = tokenize(output)[1].upper()
     if len(mac.replace("0", "").replace(":", "")) == 0:
         return None
     return mac
def find_similar_articles(corpus_name, method, content, data_dir=os.getcwd(), index=None):

    """
    - corpus_name : Le nom du corpus sur lequel on travaille (fichier .tsv 
        sans l'extension .tsv)
        
    - method : ldan (n = le nombre de topics), lsin ou tfidf
    
    - content : un texte
    
    Renvoie les 5 articles de corpus_name les plus proches du contenu spécifié 
    
    """

    corpus_file = os.path.join(data_dir, corpus_name + '_' + method + '.mm')
    index_file = os.path.join(data_dir, corpus_name + '_' + method + '_index')
    docid_file = os.path.join(data_dir, corpus_name + '_docid.txt')
    
    # Load the corpus
    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Unable to load the file %s. Did you run the corpus_to_matrix.py script first?' % (corpus_file))

    # Load the index file if it was not provided as an argument
    if not index:
        try:
            index = similarities.docsim.Similarity.load(index_file)
        except Exception:
            raise IOError("""Unable to load the file %s. Did you run the %s script with the --saveindex option?""" % (index_file, method))

    dico_file = os.path.join(data_dir, corpus_name + '_wordids.txt')

    # Load the dictionary
    try:
        id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
    except Exception:
        raise IOError("Unable to load the file %s" % (dico_file))

    # Load the model corresponding to the method requested by the user
    if method == 'tfidf':
        model_file = os.path.join(data_dir, corpus_name + '_tfidf_model')
        model = models.tfidfmodel.TfidfModel.load(model_file)

    elif method.startswith('lsi'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.lsimodel.LsiModel.load(model_file)

    elif method.startswith('lda'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.ldamodel.LdaModel.load(model_file)

    tokens = model[id2word.doc2bow(utils.tokenize(content))]

    # Return the 5 closest articles
    sims = index[tokens]   
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return json.dumps([{'id': utils.get_article_by_corpus_number(x[0], docid_file), 'score': round(x[1], 2)} for x in sims[:5]])
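A hypothetical call, assuming the preprocessing scripts mentioned in the error messages have already produced the <corpus>_tfidf.mm, index, wordids and docid files in data_dir; the corpus name and path are placeholders.

result_json = find_similar_articles(
    'wiki_fr', 'tfidf',
    "Photosynthesis converts light energy into chemical energy.",
    data_dir='/data/corpora')
print(result_json)   # JSON list of the 5 closest article ids with their scores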
def word_freq(filenames, stopset):
    wordset = set()   # set of all words seen
    freqset_list = [[],[]] # word-frequency records for negative and positive texts, respectively
    npos = 0 # number of positive texts so far
    nneg = 0 # number of negative texts so far
    icur = 0 # index of the current positive or negative text
    for filename in filenames:
        fr = file(filename, 'r')
        while True:
            line = fr.readline().decode("utf-8")
            if len(line) == 0: # Zero length indicates EOF
                break
            id,label,text = proc_line(line)
            token_list = tokenize(text)
            token_list = del_stopwords(token_list, stopset)
            wordfreq_dict = {}
            for token in token_list:
                wordset.add(token) # add the word to the overall word set
                if wordfreq_dict.has_key(token):
                    wordfreq_dict[token] += 1
                else:
                    wordfreq_dict[token] = 1
            doc = [id, label, wordfreq_dict] # record each text's id, label and word frequencies as a list
            # append the text to the appropriate list
            index = 0
            if label == '1':
                index = 1
                freqset_list[1].append(doc)
                icur = npos
                npos += 1
            elif label == '-1':
                index = 0
                freqset_list[0].append(doc)
                icur = nneg
                nneg += 1
            else:
                print 'tag-unknown text'
                continue
        fr.close()
        # save the feature words to a file
        f = open('./Training/WordSet.txt', 'w')
        for word in wordset:
            string = word + '\n'
            f.write(string.encode("utf-8"))
        f.close()
        # save the raw word frequencies to a file
        f = open('./Training/WordFreq_Orig.txt', 'w')
        for i in range(2):
            for freqset in freqset_list[i]:
                id = freqset[0]
                label = freqset[1]
                freq_list = freqset[2]
                string = id + '\t' + label + '\t'
                for word in freq_list:
                    string += word + ',' + str(freq_list[word]) + ';'
                string += '\n'
                f.write(string.encode('utf-8'))
    return wordset, freqset_list
Example #16
 def v4addr(self, iface):
     output = self.machine.run("ip addr show dev %s | grep '^ *inet '" % iface).strip()
     def parseaddr(a):
         a = a.strip()
         if "/" not in a:
             return a + "/32"
         return a
     tokset = [tokenize(l) for l in splitlines(output)]
     return [parseaddr(toks[1]) for toks in tokset if len(toks) > 0]
Example #17
def find_word_freq(li):
    all_tokens = [normalize(t, lowercase=False)
                  for aff in li
                  for t in tokenize(text_in_element(aff),
                                    split_alphanum=split_alphanum)]
    freq = defaultdict(int)
    for token in all_tokens:
        freq[token] += 1
    return freq
Example #18
def dict_from_file(filename, match_case=True):
    d = defaultdict(list)
    with codecs.open(DICTS_DIR + filename, 'rb', encoding='utf8') as f:
        for line in f:
            tokens = tokenize(normalize(line, lowercase=(not match_case)),
                    split_alphanum=split_alphanum)
            for (nb, token) in enumerate(tokens):
                d[token] += [(tokens, nb)]
        return (d, match_case)
Example #19
def predict_answers(data, word2vec, N):

    stop = stopwords.words('english')

    pred_answs = []
    for i in range(data.shape[0]):
        #calculate word2vec for question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
        q_vec = q_vec / linalg.norm(q_vec)
    
        #calculate word2vec for answers
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec  and w.lower() not in stop:
                A_vec += word2vec[w.lower()]
    
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]
            
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]
    
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]
    
        A_vec = A_vec / linalg.norm(A_vec) 
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        
        #choose the answer with the highest cosine similarity to the question
        idx = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec).argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
        
    return pred_answs
def build_vocab(docs, save_as):
    start = time.time()
    vocab = set()
    for file in utils.iterate_corpus(docs):
        with open(file, 'r') as f:
            tokenized = itertools.chain.from_iterable(utils.tokenize(line) for line in f.readlines())
        vocab.update(tokenized)
    vocab = list(vocab)
    pkl.dump(vocab, open(save_as, 'wb'))
    print('Built vocabulary and saved it to "%s" in %s' % (save_as, utils.strtime(time.time() - start)), file=sys.stderr)
    return vocab
def predict(data, docs_per_q):  
    #index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    
    res = []
    f = []
    for index, row in data.iterrows():
        #get answers words 
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
    
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
    
        q = row['question']
        
        for d in list(zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q)))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A','B','C','D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
        f.append([sc_A, sc_B, sc_C, sc_D])        
     
    features = np.array(f)
    pd.DataFrame({'id': list(data['id']),'fA': features[:,0], 'fB': features[:,1], 'fC': features[:,2], 'fD': features[:,3]})[['id', 'fA', 'fB', 'fC', 'fD']].to_csv('features_ck12.csv', index = False)
    
    return res
    def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
        self.doc_freq = FreqDist()
        for count, (label, text) in enumerate(data, start=1):
            for word in set(utils.tokenize(text, include_ngrams, limit_ngrams=True)):
                self.doc_freq.inc(word)
        self.doc_count = count

        self.min_vocab_freq = 1
        self.max_vocab_freq = .95 * self.doc_count
        print 'Min/max vocabulary frequency:', self.min_vocab_freq, self.max_vocab_freq

        self.features = sorted(filter(self._is_valid_feature, self.doc_freq))
def load_mol_data(calc_set, opt_set, struct_set, prop_set=None):
    '''
    Load data from data sets and return lists of structure names, full paths
    to the geometry data, the properties, and the meta data.
    '''
    print "Dataset options used"
    print "\tCalculation methods:", calc_set
    print "\tOptimization methods:", opt_set
    print "\tStructure sets:", struct_set
    print "\tProperties:", prop_set
    names = []
    datasets = []
    geom_paths = []
    properties = []
    meta = []
    lengths = []

    for j, base_path in enumerate(opt_set):
        for i, file_path in enumerate(calc_set):
            for m, atom_set in enumerate(struct_set):
                path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, atom_set, file_path)
                with open(path + ".txt", 'r') as f:
                    for line in f:
                        temp = line.split()
                        name, props = temp[0], temp[1:]

                        names.append(name)
                        datasets.append((base_path, file_path, atom_set))

                        geom_path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, 'geoms', 'out', name + '.out')
                        geom_paths.append(geom_path)

                        properties.append([float(x) for x in props])

                        # Add part to feature vector to account for the 4 different data sets.
                        base_part = [j == k for k, x in enumerate(opt_set)]
                        # Add part to feature vector to account for the 3 different methods.
                        method_part = [i == k for k, x in enumerate(calc_set)]
                        # Add part to feature vector to account for the addition of N.
                        atom_part = [m == k for k, x in enumerate(struct_set)]
                        # Add bias feature
                        bias = [1]
                        meta.append(base_part + method_part + atom_part + bias)

                        tokens = tokenize(name, explicit_flips=True)
                        aryl_count = sum([1 for x in tokens if x in ARYL])
                        lengths.append(aryl_count)

    prop_desc = (("H**O", "eV"), ("LUMO", "eV"), ("Excitation", "eV"))
    prop_vals = zip(*properties)
    prop_out = [(x, y, z) for ((x, y), z) in zip(prop_desc, prop_vals)]
    return names, datasets, geom_paths, prop_out, meta, lengths
Example #24
    def quagga_info(self):
        output = self.machine.run("zebra --version")
        info = {
            "daemon": "Quagga",
            "version": tokenize(splitlines(output)[0])[-1],
            "ospf": {}
            }

        neighbours = []
        log.info("[%s] getting OSPF neighbours" % self.hostname())
        output = self.machine.run("echo show ip ospf neighbor | vtysh | grep '^[1-9]'")
        for toks in [tokenize(l) for l in splitlines(output)]:
            if len(toks) == 0:
                continue
            neighbour = {
                "routerid": toks[0],
                "v4addr":   toks[4],
                "ifname":   toks[5].split(":")[0]
                }
            neighbours.append(neighbour)
        info["ospf"]["neighbours"] = neighbours
        return info
def build_index(docs, vocab, save_as):
    start = time.time()
    word2idx = dict([(v, k) for k, v in enumerate(vocab)])
    tf = dict([(i, list()) for i in xrange(len(vocab))])
    df = Counter()
    n_docs = len(list(utils.iterate_corpus(docs)))
    files = list()
    for i, file in enumerate(utils.iterate_corpus(docs)):
        print('%d/%d %s' % (i+1, n_docs, utils.strtime(time.time() - start)), file=sys.stderr, end='\r')
        files.append(file)
        with open(file, 'r') as f:
            text = f.read()
            word_counts = Counter(word2idx[w] for w in utils.tokenize(text))
            df.update(word2idx[w] for w in set(utils.tokenize(text)))
            n_words = utils.counter_sum(word_counts)
            for word, count in word_counts.items():
                tf[word].append((count / math.log(n_words), i))
    for word, docs in tf.items():
        docs.sort(key=lambda x: x[0], reverse=True)
    tfidf = tf, df, files
    pkl.dump(tfidf, open(save_as, 'wb'))
    print('Processed %d documents in %s' % (n_docs, utils.strtime(time.time() - start)), file=sys.stderr)
    return tfidf
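A sketch of how the (tf, df, files) index returned above might be queried for a single word; the scoring (stored tf value times log(n_docs / document_frequency)) mirrors how the postings were built and is an assumption, not code from the original project.

import math

def score_word(word, tf, df, files, word2idx):
    # Rank the documents containing `word` by tf-idf, best first,
    # assuming the postings were produced by build_index above.
    if word not in word2idx:
        return []
    wid = word2idx[word]
    if not df[wid]:
        return []
    idf = math.log(len(files) / float(df[wid]))
    return [(files[doc_idx], tf_val * idf) for tf_val, doc_idx in tf[wid]]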
Example #26
 def tag(self, text):
     """
     Tags the given text.
     
     :param text: a string or unicode object. Strings assumed to be utf-8
     :returns: a list of lists (sentences with tokens). Each sentence has (token, tag) tuples.
     """
     tokens = utils.tokenize(text, clean=False)
     result = []
     for sent in tokens:
         tags = self.tag_tokens(sent)
         result.append(zip(sent, tags))
     
     return result
 def clusterSentence(self, sentence):
     """
     clusters the given sentence with existing cluster or creates a
     new cluster.
     sentence - sentence to be clustered
     """
     words = utils.tokenize(sentence.lower())
     lems = utils.lemmatize(words)
     terms = utils.filterStopWords(lems)
     tf = dict(Counter(terms))
     self.clusterize(tf, sentence)  
  
     # Every time a new sentence is clusterized, save latest clusters
     self.saveClusters()
def predict(data, docs_per_q):  
    #index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    
    res = []
    doc_score = [["A","B","C","D"]]
    for index, row in data.iterrows():
        #get answers words
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
    
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
    
        q = row['question']
        
        for d in list(zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q)))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A','B','C','D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
        doc_score.append([sc_A, sc_B, sc_C, sc_D])
    return res, doc_score
def find_similar_articles(corpus_name, method, id=None, content=None):

    corpus_file = corpus_name + '_' + method + '.mm'
    index_file = corpus_name + '_' + method + '_index'
    docid_file = corpus_name + '_docid.txt'

    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Unable to load the file %s' % (corpus_file))

    try:
        index = similarities.docsim.Similarity.load(index_file)
    except Exception:
        raise IOError('Unable to load the file %s' % (index_file))

    if id is not None:  
        corpus_id = utils.get_article_by_id(id, docid_file)
        tokens = corpus[corpus_id]

    elif content is not None:
        dico_file = corpus_name + '_wordids.txt'

        try:
            id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
        except Exception:
            raise IOError("Impossible de charger le fichier %s" % (dico_file))

        if method == 'tfidf':
            model_file = corpus_name + '_tfidf_model'
            model = models.tfidfmodel.TfidfModel.load(model_file)

        elif method.startswith('lsi'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.lsimodel.LsiModel.load(model_file)

        elif method.startswith('lda'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.ldamodel.LdaModel.load(model_file)

        tokens = model[id2word.doc2bow(utils.tokenize(content))]

    else:
        raise Exception("Il faut fournir un id ou un contenu")

    sims = index[tokens]   
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    return [(utils.get_article_by_corpus_number(x[0], docid_file), x[1]) for x in sims[:5]]
Example #30
 def tag(self, text, no_repeats=False):
     """
     Runs the SRL process on the given text.
     
     :param text: unicode or str encoded in utf-8.
     :param no_repeats: whether to prevent repeated argument labels
     :returns: a list of SRLAnnotatedSentence objects
     """
     tokens = utils.tokenize(text, clean=False)
     result = []
     for sent in tokens:
         tagged = self.tag_tokens(sent)
         result.append(tagged)
     
     return result
Example #31
from collections import Counter

import nltk
import numpy as np
import pandas

# noinspection PyUnresolvedReferences
from utils import tokenize

# importing corpus as resume
resume_file = open('../assets/resume.txt', 'r')
resume = resume_file.read().lower()
resume_file.close()

# tokenizing the resume
tokens = tokenize(resume)

# dividing corpus into 6 documents
k = len(tokens) // 6
documents = []
for i in range(5):
    documents.append(tokens[i * k:(i + 1) * k])
documents.append(tokens[5 * k:])

# calculating most common 5 tokens from each document and storing frequency tables for each document
most_common = set()
document_frequencies = []
for document in documents:
    frequencies = Counter(document)
    document_frequencies.append(frequencies)
    for word, frequency in frequencies.most_common(5):
        most_common.add(word)  # assumed continuation of the truncated snippet: collect each document's top words
Example #32
    def read_training_dataset(self, input_path):
        with open(input_path) as f:

            data = json.load(f)
            self.no_samples = len(data)

            # for padding.
            self.words_converter.T2id('<PAD>')

            self.words_converter.T2id('<SOS>')

            self.slots_converter.T2id('<PAD>')
            self.slots_converter.T2id('<SOS>')

            self.slots_converter.T2id('-')

            for i in tqdm(range(self.no_samples)):

                entry = data[str(i)]

                text = entry["text"]
                text = normalizeString(text)
                tokens = tokenize(text)
                self.stcs_literals.append(tokens)
                tokens_id = [self.words_converter.T2id(id) for id in tokens]
                tokens_id.append(self.words_converter.T2id('</s>'))
                self.stcs.append(tokens_id)
                self.lengths.append(len(tokens_id))

                intent = entry["intent"]

                self.intents.append(self.intent_converter.T2id(intent))

                slots_dictionary = entry["slots"]
                # +1 make room for <SOS>
                slots_id = [self.slots_converter.T2id('-')] * len(tokens_id)
                slots_id[0] = self.slots_converter.T2id('<SOS>')

                no_slots_in_stc = 0
                for slot, target_words in slots_dictionary.items():
                    target_words = normalizeString(target_words)
                    target_word_list = tokenize(target_words)
                    for word in target_word_list:
                        no_slots_in_stc += 1
                        try:
                            idx = tokens.index(word)
                        except:
                            idx = [
                                i for i, s in enumerate(tokens) if word in s
                            ][0]

                        # +1 account for <SOS>
                        slots_id[idx + 1] = self.slots_converter.T2id(slot)

                # keep count of no slots
                for j in range(len(tokens_id) - no_slots_in_stc):
                    self.slots_converter.T2id('-')

                self.slots.append(slots_id)
                # self.slots.append(torch.tensor(slots_id, dtype=torch.long, device=self.device))

            # add padding

            ncols = max(self.lengths)

            self.X = self.stcs
            self.Y = self.slots
# import libraries
import pandas as pd
from sqlalchemy import create_engine
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils import tokenize


print('Loading data...')
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('disaster_message_category', engine)

print('Tokenizing words...')
word_string = " ".join(df['message'])
word_string_final = " ".join(tokenize(word_string))

print('Creating wordcloud...')
wordcloud = WordCloud(width=800,
                      height=400,
                      background_color='white',
                      max_words=300).generate(word_string_final)

print('Generating png image...')
# plot the WordCloud image
plt.figure(figsize=(8, 4), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.savefig('app/static/images/wordcloud.png', dpi=105)
    def qe(self, sourceLang, targetLang, sourceText, targetText):
        """
        Performs translation quality estimation on sourceText to targetText using QuEst++ and fast_align
        It's ok to raise Exceptions here. They are handled upstream.
        """
        os.makedirs('data/tmp', exist_ok=True)

        if not [sourceLang, targetLang] in self.supportedPairs:
            raise Exception("{}-{} language pair not supported".format(
                sourceLang, targetLang))

        # Sanitize input
        aligned = hunalign(sourceText, targetText)
        sourceText = [tokenize(x[0], sourceLang, False) for x in aligned]
        targetText = [tokenize(x[1], targetLang, False) for x in aligned]
        sourceTextPlain = '\n'.join([' '.join(x) for x in sourceText])
        targetTextPlain = '\n'.join([' '.join(x) for x in targetText])

        alignments = fast_align.FastAlign().align(sourceLang, targetLang,
                                                  sourceTextPlain,
                                                  targetTextPlain)['alignment']
        with open('data/tmp/alignments', 'w') as fileAlignments:
            fileAlignments.write(alignments)

        with open('data/tmp/source', 'w') as fileSource:
            fileSource.write(sourceTextPlain)

        with open('data/tmp/target', 'w') as fileTarget:
            fileTarget.write(targetTextPlain)

        with DirCrawler('qe/questplusplus'):
            print("Extracting features")
            (_output, _error) = bash("""
                 java -cp QuEst++.jar:lib/* shef.mt.WordLevelFeatureExtractor
                 -lang english spanish
                 -input ../../data/tmp/source ../../data/tmp/target
                 -alignments ../../data/tmp/alignments
                 -config ../questplusplus-config/config.word-level.properties
                 """)

            outputFile = 'output/test/output.txt'
            if not os.path.isfile(outputFile):
                raise Exception('Server Processing Error')
            with open(outputFile, 'r') as outputFileR:
                features = outputFileR.readlines()

        os.remove('data/tmp/alignments')
        os.remove('data/tmp/source')
        os.remove('data/tmp/target')

        features = [[
            x.split('=')[1] for x in line.rstrip('\n').rstrip('\t').split('\t')
        ] for line in features]
        with open('data/tmp/features', 'w') as fileFeatures:
            fileFeatures.write('\n'.join(['\t'.join(x) for x in features]))
        with open('data/tmp/labels', 'w') as fileLabels:
            fileLabels.write('\n'.join(['1'] * len(features)))

        with DirCrawler('qe/questplusplus'):
            print("Removing output directory structure for feature extractor")
            os.remove(outputFile)
            os.rmdir('output/test')
            os.rmdir('output')

            print("Machine Learning")
            (_output, _error) = bash(f"""
                python learning/src/learn_model.py ../questplusplus-config/svr_{sourceLang}_{targetLang}.cfg
                """)

            with open('predicted.csv', 'r') as predictedFile:
                output = [
                    float(x.rstrip('\n').split('\t')[1])
                    for x in predictedFile.readlines()
                ]
            os.remove('predicted.csv')

        os.remove('data/tmp/features')
        os.remove('data/tmp/labels')
        os.rmdir('data/tmp')
        return {'status': 'OK', 'qe': output}
    # Evaluation setup
    'sample': '如',
    'max_sample_length': 50,
    'sample_range':
    2  # how many words in the dictionary to be considered when sampling
}

# -------------------------Data feeding preparation---------------
# Read and tokenize data
texts = [
    './data/qts_tab.txt', './data/qsc_tab.txt', './data/qtais_tab.txt',
    './data/qss_tab.txt'
]
# max and min length of poem sequence
maxlen = 100
minlen = 7
poems = []
# for t in texts:
#     poems.extend(utils.read_poem(t))
for t in texts:
    poems.extend(utils.read_regular_poem(t))

poems = utils.chop_poems(poems, maxlen, minlen)
data, count, dictionary, reverse_dictionary = utils.tokenize(
    poems, params['vocabulary_size'])

rnnlm = language_model.RNNLM(params, data, count, dictionary,
                             reverse_dictionary)
rnnlm.train(sample_interval=100, save_interval=5000, logger=None)
# rnnlm.sample(sample_len=100, checkpoint_dir='./tmp/rnndata/')
Example #36
data = pd.read_csv('../data/data.csv', skiprows=0)
filtered = data[[
    'REGI', 'TYPO', 'VISUAL_SIMILARITY', 'SOUNDEX_DISTANCE'
]][(data['EDIT_DISTANCE'] == 1) & (data['IS_TYPO'] == 1)
   & ((data['VISUAL_SIMILARITY'] >= 0.8) | (data['SOUNDEX_DISTANCE'] <= 1))]
filtered = filtered[filtered.TYPO.map(lambda x: x.count('.')) == 2]
filtered = filtered[filtered.REGI.map(lambda x: x.count('.')) == 2]
filtered.reset_index(drop=True, inplace=True)

reg_list = list()
typo_list = list()
for i in range(filtered.shape[0]):
    reg_list.append(filtered['REGI'][i].split('.')[0])
    typo_list.append(filtered['TYPO'][i].split('.')[0])

in_list, out_list = utils.tokenize(reg_list, typo_list, token_size)

in_vocab = set()
out_vocab = set()
for name in in_list:
    for char in name:
        in_vocab.add(char)
for name in out_list:
    for char in name:
        out_vocab.add(char)
vocab = in_vocab.union(out_vocab)
num_encoder_tokens = len(in_vocab)
num_decoder_tokens = len(out_vocab)
max_encoder_seq_length = max([len(name) for name in in_list])
max_decoder_seq_length = max([len(name) for name in out_list])
Example #37
def html_to_json(url):
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)
    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the most bottom level tr
        if tr.find_all('tr'):
            continue
        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})

        if is_title_row:
            continue

        if not row_content or not table_title:
            continue

        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']

        matched = False
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        if terminate_on_mismatch and not matched:
            table_title = None
            ignore_image = True
    return result
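From the keys read in html_to_json above, a schema/<category>.json file presumably has a shape like the following; the table name and values are invented for illustration.

example_schema = {
    "General election": {
        "ignore image": True,
        "column index": 0,
        "strict match": ["Turnout", "Total votes"],
        "regex match": ["Certified"],
        "terminate on mismatch": False,
    }
}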
    args = parser.parse_args()

    logging.basicConfig(filename=args.log_filepath,
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    if args.command == "construct_vocab":

        dictionary = corpora.Dictionary()

        count = 0

        line = sys.stdin.readline()
        while line:

            tokens = tokenize(line)

            dictionary.add_documents([tokens], prune_at=None)
            count += 1

            if count % 100000 == 0:
                print_err("line %d %d" % (count, len(dictionary)))

            line = sys.stdin.readline()

        dictionary.save(args.vocabulary_filename)
        dictionary.save_as_text(args.vocabulary_filename + ".txt")

    elif args.command == "construct_corpus":
        # use glob to recurse under data/TXTs directory
def convert_filename(filename):
    tokenized_filename = tokenize(filename.replace('/', '.'))
    return f'{constants.FILE_START} {tokenized_filename} {constants.FILE_END}\n'
Example #40
from nltk.corpus import semcor
import utils

count = 0
num_sentences = 0
for i in range(100):
    sent = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')[i]

    sentence = ""
    name = ""

    for wordform in sent.getchildren():
        sentence += wordform.text + " "
        if wordform.get('pos') == "NN" and wordform.text != "anyone":
            name = wordform.text
            sense_key = wordform.get('lexsn')

    context = utils.tokenize(sentence)

    if name != "":
        best_sense = utils.find_synset(context,name)
        num_sentences += 1
        if sense_key == best_sense.lemmas()[0].key()[-9:]:
            count += 1
    if num_sentences == 50:
        break

print("accuracy = " + str(count*100/num_sentences) + " %")
# http://arxiv.org/abs/1410.4615
# "Sequence to Sequence Learning with Neural Networks"
# https://arxiv.org/abs/1409.3215
reverse = True

data_path = './data'
train_books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt', 'botanical_2.txt', 'botanical_terms.txt'
]
val_books = ['wonderland.txt', 'botanical_1.txt']

if __name__ == '__main__':
    # Prepare training data.
    text = read_text(data_path, train_books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))

    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)
    print(train_encoder[:10])
    print(train_decoder[:10])
    print(train_target[:10])

    input_chars = set(' '.join(train_encoder))
    target_chars = set(' '.join(train_decoder))
    nb_input_chars = len(input_chars)
    nb_target_chars = len(target_chars)
Example #42
        #                                  .replace('ё', 'ё') \
        #                                  .strip()
        line = utils.norm_text2(re2.sub('', line))
        if line:
            lines.append(' '.join(line.split()))
    if len(lines) >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            print(header, file=f)
            f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
    #exit()
if need_enter:
    print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links, isdialog=False)
Example #43
def generate_repo_dataset(fullname, branch, sha_list, repo_dir, writer):
    repo = Repo(repo_dir)
    total_cnt, current_cnt, msg_skip, diff_skip, word_skip = 0, 0, 0, 0, 0
    index_list, origin_target_list, target_list, origin_line_list, line_list, origin_word_list, word_list = [],[],[],[],[],[],[]
    for sha in sha_list:
        commit = repo.commit(sha)
        total_cnt += 1
        commit_msg = commit.message
        sentences = split_sentence(commit_msg)
        if not sentences:
            continue
        commit_msg = sentences[0].strip()
        commit_msg_lower = commit_msg.lower()
        if 'revert' in commit_msg_lower or commit_msg_lower.startswith('merge '):
            msg_skip += 1
            continue
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        origin_commit_msg = commit_msg
        if not commit_msg:
            msg_skip += 1
            continue
        
        commit_msg = tokenize(commit_msg)
        commit_msg = remove_last_special_char(commit_msg.strip())
        commit_msg = remove_no_english_str(commit_msg)
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        commit_msg = commit_msg.strip()
        if not commit_msg:
            msg_skip += 1
            continue
        commit_words = commit_msg.split()
        # if not starts_with_verb(commit_words):
        #     msg_skip += 1
        #     continue
        if len(commit_words) > constants.TARGET_SEQ_LEN_MAX:
            msg_skip += 1
            continue
            
        line_diff = get_line_diff(repo_dir, sha)
        if not line_diff:
            diff_skip += 1
            continue
        origin_line_diff = line_diff
        line_diff = remove_no_english_str(line_diff)
        line_diff = remove_redundant_white_space(line_diff.strip())
        line_diff_words = line_diff.split()
        if not overlap_two_seq(line_diff_words, commit_words):
            diff_skip+=1
            continue

        if len(line_diff_words) > constants.SOURCE_SEQ_LEN_MAX:
            diff_skip+=1
            continue
        
        word_diff = get_word_diff(repo_dir, sha)
        if not word_diff:
            word_skip += 1
            continue
        origin_word_diff = word_diff
        word_diff = remove_no_english_str(word_diff)
        word_diff = remove_redundant_white_space(word_diff.strip())
        if not word_diff:
            word_skip += 1
            continue
        word_diff_words = word_diff.split()
        index = f'{fullname} {sha}'
        writer.write(index, origin_commit_msg, commit_msg, origin_line_diff, line_diff, origin_word_diff, word_diff)
        current_cnt+=1
    print(f'{fullname}:  {current_cnt}/{total_cnt}')
    return current_cnt
Example #44
def search(*arguments):
    print("Loading Files")
    outfile = open("./query_op.txt", 'w')
    with open(arguments[0], 'r') as f:
        queries = f.readlines()
    with open("./inverted_index/titleOffset.txt", 'r') as f:
        titleOffSet = [int(line.strip()) for line in f]
    with open("./inverted_index/offset.txt", 'r') as f:
        offset = []
        for line in f.readlines():
            try:
                offset.append(int(line.strip()))
            except BaseException:
                continue
    vocabFile = open("./inverted_index/vocab.txt", 'r')
    titleFile = open("./inverted_index/title.txt", 'r')
    with open("./inverted_index/fileNumbers.txt", 'r') as f:
        nFiles = int(f.read().strip())
    key_words = ['t:', 'b:', 'i:', 'c:', 'r:', 'l:']
    print("Starting Queries")
    numQueries = 0
    for query in queries:
        startTime = time.time()
        numQueries += 1
        query = query.strip().lower()
        numResults, query = query.split(",")
        query = query.strip()
        numResults = int(numResults)
        queryType = "Plain"
        for w in key_words:
            if w in query:
                queryType = "Field"
                break

        if queryType == "Field":
            q = re.split("(t:)|(b:)|(i:)|(c:)|(r:)|(l:)", query)
            q = [i.strip() for i in q if i is not None and i != ""]
            queryDict = defaultdict(list)
            for idx in range(0, len(q), 2):
                data = tokenize(q[idx + 1].lower())
                data = [w for w in data if w not in stopWords]
                data = stemmer.stemWords(data)
                queryDict[q[idx].split(":")[0]].extend(data)
            results, docFreq = fieldQuery(queryDict, vocabFile, offset)
            results = rank(results, docFreq, nFiles)
        else:
            q = tokenize(query)
            q = [w for w in q if w not in stopWords]
            q = stemmer.stemWords(q)
            t = simpleQuery(q, vocabFile, offset)
            results, docFreq = t[0], t[1]
            results = rank(results, docFreq, nFiles)

        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            results = results[:numResults]
            for key in results:
                title, _ = fileBinarySearch(
                    0, len(titleOffSet), titleOffSet, key, titleFile, 'int')
                print(','.join([key] + [' '.join(title)]), file=outfile)
        endTime = time.time()
        print(
            "{0}, {1}".format(
                endTime - startTime,
                (endTime - startTime) / numResults),
            file=outfile)

        print('\n', file=outfile)
    outfile.close()
Example #45
                text = None
            break
        if not res:
            if not SILENT:
                if not text:
                    print('no text')
                    #if nop:
                    #    exit()
                else:
                    print('text beyond limits:')
                    print(text)
            continue
        texts_total += 1
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
              end='')
        need_enter = True
        #exit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(utils.TEXTS_FOR_SOURCE)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(utils.TEXTS_FOR_SOURCE, isdialog=False)
Example #46
            model.cuda()

        gen_text = generation(embedding, model, state, options.n,
                              options.primer)
        print(gen_text)
    else:
        lr = model_settings['learning_rate']
        layers = model_settings['layers']
        batch_size = model_settings['batch_size']
        rnn_size = model_settings['rnn_size']
        embed_size = model_settings['embed_size']
        seq_length = model_settings['seq_length']
        dropout = model_settings['dropout']
        data_size = 256  # ???

        train_x = utils.tokenize(options.train_data)
        train_x = utils.batchify(train_x, batch_size)
        num_batches = train_x.size(0) // seq_length

        if len(options.load_model) > 0:
            checkpoint = torch.load(options.load_model)
            embedding = checkpoint['embed']
            model = checkpoint['rnn']
        else:
            embedding = nn.Embedding(256, embed_size)
            model = Stacked_mLSTM(mLSTM, layers, embed_size, rnn_size,
                                  data_size, dropout)

        loss_fn = nn.CrossEntropyLoss()
        embed_optimizer = optim.Adam(embedding.parameters(), lr=lr)
        model_optimizer = optim.Adam(model.parameters(), lr=lr)
Example #47
    logger = utils.get_logger()

    logger.info('Reading model')
    sess = tf.InteractiveSession()
    model = multimlp.MultiFeedForward.load(args.load, sess)
    word_dict, embeddings = readdata.load_embeddings(args.embeddings, args.vocab,
                                                     generate=False,
                                                     load_extra_from=args.load)
    embeddings = utils.normalize_embeddings(embeddings)
    model.initialize_embeddings(sess, embeddings)
    number_to_label = {v: k for (k, v) in utils.label_map.items()}

    while True:
        sent1 = raw_input('Type sentence 1: ')
        sent2 = raw_input('Type sentence 2: ')
        tokens1 = utils.tokenize(sent1)
        tokens2 = utils.tokenize(sent2)
        vector1 = convert_tokens(tokens1, word_dict, model.max_time_steps1)
        vector2 = convert_tokens(tokens2, word_dict, model.max_time_steps2,
                                 prepend=word_dict[utils.GO])

        feeds = {model.sentence1: vector1,
                 model.sentence2: vector2,
                 model.sentence1_size: [len(tokens1)],
                 model.sentence2_size: [len(tokens2)+1],
                 model.dropout_keep: 1.0}

        answer = sess.run(model.answer, feed_dict=feeds)
        print('Model answer:', number_to_label[answer[0]])

        print()
Example #48
#!/usr/bin/python

from utils import tokenize, stdin

words_count = {}
for line in stdin():
    for word in tokenize(line, [' ', '\t', '-']):
        words_count[word] = words_count.get(word, 0) + 1

sorted_words_count = sorted(words_count.items(),
                            reverse=True,
                            key=lambda tup: tup[1])

for word in sorted_words_count:
    print("%i %s" % (word[1], word[0]))
Example #49
from model import NerModel
import tensorflow as tf  # needed below for tf.data, the Adam optimizer and tf.train.Checkpoint
import tensorflow_addons as tf_ad
import os
import numpy as np
from utils import tokenize, read_vocab, build_vocab  # helpers used below; module name matches the companion test snippet
from args_help import args
from my_log import logger

if not (os.path.exists(args.vocab_file) and os.path.exists(args.tag_file)):
    logger.info("building vocab file")
    build_vocab([args.train_path], args.vocab_file, args.tag_file)
else:
    logger.info("vocab file exits!!")

vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (text_sequences, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(
    args.batch_size, drop_remainder=True)

logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format(
    args.hidden_num, len(vocab2id), len(tag2id)))
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size)
optimizer = tf.keras.optimizers.Adam(args.lr)

ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
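A possible continuation, shown only as a hedged sketch: the tf.train.Checkpoint above is typically paired with a tf.train.CheckpointManager to restore and periodically save weights. The "./checkpoints" directory is an assumption here, not a flag defined in args_help.

# Hedged sketch: wire the checkpoint into a manager (directory name is assumed).
ckpt_manager = tf.train.CheckpointManager(ckpt, directory="./checkpoints", max_to_keep=3)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    logger.info("restored from {}".format(ckpt_manager.latest_checkpoint))
# ckpt_manager.save() would then be called periodically inside the training loop.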
Example #50
    # for tweet in tokenized_tweets:
    #     tweets.append(tweet['clean'])
    #     labels.append(tweet['class'])

    # train = pd.read_csv("../Data/imdb/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    train = pd.read_csv("../Data/imdb/train.tsv",
                        header=0,
                        delimiter="\t",
                        quoting=3)
    # test = pd.read_csv("../Data/imdb/testData.tsv", header=0, delimiter="\t", quoting=3)

    tokenized_train = []

    for idx, text in train.iterrows():
        # tokenized_train.append(ut.tokenize(text['review'], text['sentiment'])) # for labeledTrainData.tsv
        tokenized_train.append(ut.tokenize(text['Phrase'],
                                           text['Sentiment']))  # for train.tsv

    tweets = []
    labels = []
    for tweet in tokenized_train:
        tweets.append(tweet['clean'])
        labels.append(tweet['class'])

    partition = 5
    train_tweets, test_tweets, train_labels, test_labels = ut.crossValidation2(
        tweets, labels, partition)

    # kf = cv.KFold(n=len(tweets), n_folds=3, shuffle=True, indices=False)

    accuracyLR, precisionLR, recallLR, f_measureLR = [], [], [], []
    accuracyRF, precisionRF, recallRF, f_measureRF = [], [], [], []
Example #51
import json
from torch.utils.data import Dataset, DataLoader
from model import NeuralNet
# NOTE: tokenize, stem and bow (bag-of-words) are assumed to come from the project's NLP utils module.

with open('intents.json','r') as f:
    intents = json.load(f)

# print(intents) 
all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        # use extend instead of append as we don't want an array of arrays
        xy.append((w,tag))

ignore_words = ['?','!','[',']','.',',']
all_words = [stem(w) for w in all_words if w not in ignore_words]

all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []

for (sen, tag) in xy:
    bag = bow(sen,all_words)
Example #52
def main(args):

    nlp = spacy.load('en')
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        # all_program_strs = []
        # for q in questions:
        #   if 'program' not in q: continue
        #   program_str = program_to_strs(q['program'], args.mode)[0]
        #   if program_str is not None:
        #     all_program_strs.append(program_str)
        # program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            # 'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and entities
    print('Encoding data')
    questions_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    questions_len = []
    questions_mask = []
    noun_chunk_starts = []
    noun_chunk_ends = []
    entity_masks = []
    max_entity_length = 5

    for orig_idx, q in enumerate(questions):
        question = q['question'].replace('?', '').replace('.', '').replace(
            ';', ' ;').replace(',', ' ,')

        doc = nlp(question)
        start, end = find_noun_chunks(doc)
        noun_chunk_starts.append(start[:max_entity_length])
        noun_chunk_ends.append(end[:max_entity_length])

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question)

        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)

        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            answers.append(-1)

    # Pad encoded questions and entities
    max_question_length = max(len(x) for x in questions_encoded)

    for st, ed, qe in zip(noun_chunk_starts, noun_chunk_ends,
                          questions_encoded):
        entity_masks.append(
            (np.arange(max_entity_length) < len(st)).astype(int))
        if len(st) < max_entity_length:
            # qe.append(vocab['question_token_to_idx']['<NULL>'])
            padding = [len(qe) - 1] * (max_entity_length - len(st))
            st += padding

        if len(ed) < max_entity_length:
            # qe.append(vocab['question_token_to_idx']['<NULL>'])
            padding = [len(qe)] * (max_entity_length - len(ed))
            ed += padding

        questions_mask.append(
            (np.arange(max_question_length) < len(qe)).astype(int))
        if len(qe) < max_question_length:
            # qe.append(vocab['question_token_to_idx']['<NULL>'])
            padding = [vocab['question_token_to_idx']['<NULL>']
                       ] * (max_question_length - len(qe))
            qe += padding
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    entity_starts = np.asarray(noun_chunk_starts, dtype=np.int32)
    entity_ends = np.asarray(noun_chunk_ends, dtype=np.int32)
    print(entity_starts.shape)

    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        # 'programs': programs_encoded,
        # 'program_inputs': program_inputs_encoded,
        'answers': answers,
        'questions_len': questions_len,
        'questions_mask': questions_mask,
        'e_starts': entity_starts,
        'e_ends': entity_ends,
        'e_masks': entity_masks
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
Example #53
all_words = data["all_words"]
tags = data["tags"]
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Bryant's Coffee shop"
print('Type quit to exit')

while True:
    sentence = input("You: ")
    if sentence == "quit":
        break
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(-1, X.shape[0])
    X = torch.from_numpy(X)

    output = model(X)
    # print(output)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75:
        for intent in intents["intents"]:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
Example #54
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result, build_embedding_matrix
import tensorflow_addons as tf_ad
from args_help import args
import json
import numpy as np

# For the test set: build the vocab dict and tag dict, tokenize into padded sequences, and initialize the word-embedding matrix
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)
# text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)

# print('Inspect the value and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# Load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model
Example #55
def main(args):
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_strs(q['program'], args.mode)[0]
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    # value_inputs, encoded by question_token_to_idx in CLEVR
    # because all valid inputs are in question vocab
    program_inputs_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str, input_str = program_to_strs(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
            # program value_inputs
            input_tokens = tokenize(input_str)
            input_encoded = encode(input_tokens,
                                   vocab['question_token_to_idx'])
            assert len(input_encoded) == len(
                program_encoded)  # input should have the same len with func
            program_inputs_encoded.append(input_encoded)
        else:
            programs_encoded.append([-1])
            program_inputs_encoded.append([-1])

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            answers.append(-1)

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])
        for ie in program_inputs_encoded:
            while len(ie) < max_program_length:
                ie.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    program_inputs_encoded = np.asarray(program_inputs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    print(program_inputs_encoded.shape)
    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'programs': programs_encoded,
        'program_inputs': program_inputs_encoded,
        'answers': answers,
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
Example #56
                        default='dictionary.pkl',
                        type=str,
                        help='path to the dictionary')

    args = parser.parse_args()

    # Turns on logging.
    import logging
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    dictionary, rev_dict = utils.get_dictionary(args.text, args.dictionary)
    num_classes = len(dictionary)

    iterator = utils.tokenize(args.text,
                              dictionary,
                              batch_size=args.batch_size,
                              seq_len=args.seq_len)

    sess = tf.Session()
    model = SeqGAN(sess,
                   num_classes,
                   logdir=args.logdir,
                   learn_phase=args.learn_phase,
                   only_cpu=args.only_cpu)
    model.build()
    model.load(ignore_missing=True)

    for epoch in range(1, args.num_epochs + 1):
        for step in range(1, args.num_steps + 1):
            logging.info('epoch %d, step %d', epoch, step)
            model.train_batch(next(iterator))
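utils.tokenize here is consumed with next(iterator), so it is expected to yield training batches indefinitely. A minimal sketch under that assumption (character-level ids are a guess; random contiguous windows reshaped to (batch_size, seq_len)) could be:

import numpy as np

def tokenize_sketch(path, dictionary, batch_size, seq_len):
    # Map each known character to its id once, then yield random training batches forever.
    # Assumes the corpus is longer than one batch_size * seq_len window.
    with open(path, encoding='utf-8') as f:
        ids = np.asarray([dictionary[c] for c in f.read() if c in dictionary], dtype=np.int32)
    span = batch_size * seq_len
    while True:
        start = np.random.randint(0, len(ids) - span)
        yield ids[start:start + span].reshape(batch_size, seq_len)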
Example #57
def add_code_into_document(document, body):
    asts, code_hints = transform_body(body)

    flag = False

    #typed_method_call = set()
    for ast in asts:
        for mc in ast["typed_method_call"]:
            if mc:
                document.add(
                    Field("typed_method_call", mc, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True

        for e in ast["extends"]:
            if e:
                document.add(
                    Field("extends", e, Field.Store.YES, Field.Index.ANALYZED))

        for c in ast["used_classes"]:
            if c:
                document.add(
                    Field("used_classes", c, Field.Store.YES,
                          Field.Index.ANALYZED))

        for m in ast["methods"]:
            if m:
                document.add(
                    Field("methods", m, Field.Store.YES, Field.Index.ANALYZED))
                flag = True

        for m in ast["methods_called"]:
            if m:
                document.add(
                    Field("methods_called", m, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True

        #comment
        if "comments" in ast:
            for c in ast["comments"]:
                document.add(
                    Field("comments", utils.unescape_html(c), Field.Store.NO,
                          Field.Index.ANALYZED))

        for i in ast["class_instance_creation"]:
            if i:
                document.add(
                    Field("class_instance_creation", i, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True

        for l in ast["literals"]:
            if l:
                document.add(StringField("literals", l, Field.Store.YES))

        #finally all the splitted words
        # for s in camel_case:
        # 	document.add( Field("camel_case_words", s.lower(), Field.Store.NO, Field.Index.NOT_ANALYZED))

    hints = []
    for h in code_hints:
        for token in utils.tokenize(h):
            if 1 < len(token) < 20:
                hints.append(token)

    for hint in set(hints):
        document.add(
            Field("code_hints", hint, Field.Store.YES, Field.Index.ANALYZED))

    return flag
Example #58
error_rate = 0.6
reverse = True
model_path = './models/seq2seq.h5'
hidden_size = 512
sample_mode = 'argmax'
data_path = './data'
books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt'
]

test_sentence = 'The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.'

if __name__ == '__main__':
    text = read_text(data_path, books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))
    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)

    tokens = tokenize(test_sentence)
    tokens = list(filter(None, tokens))
    nb_tokens = len(tokens)
    misspelled_tokens, _, target_tokens = transform(tokens,
                                                    maxlen,
                                                    error_rate=error_rate,
                                                    shuffle=False)
Example #59
def tokenize(line, lower=True, flat=False, clean=True):
    if clean:
        line = Vocab.clean_line(line)
    toks = U.tokenize(line, lower=lower, flat=flat)
    return toks
Example #60
    # split the files deterministically via a seeded shuffle: first half for training, second half for testing
    robj = random.Random(12345)
    robj.shuffle(files)

    if args.command == 'train':
        fileSubset = files[:len(files) / 2]
    elif args.command == 'test':
        fileSubset = files[len(files) / 2:]
    else:
        fileSubset = files[len(files) / 2:]

    if args.command == 'train' or args.command == 'test':
        for i, name in enumerate(fileSubset):
            if i % 1000 == 0:
                print '%d files done' % i
            filesAndTokens.append((name, utils.tokenize(name)))
        print len(filesAndTokens)
        print sum([len(tokens) for name, tokens in filesAndTokens])

#    model = PositionDependentVectorModel(keywords, winSize=args.win,
#                                         wdim=args.dim, stepsize=args.lr,
#                                         reg=args.reg)
#    model = ConstantAttentionVectorModel(keywords, winSize=args.win,
#                                         wdim=args.dim, stepsize=args.lr,
#                                         reg=args.reg)
#    model = NonLinearVectorModel(keywords, winSize=args.win,
#                                 wdim=args.dim, zdim=args.zdim,
#                                 stepsize=args.lr,
#                                 reg=args.reg)
#    model = RnnDense(keywords, winSize=args.win,
#                    wdim=args.dim, zdim=args.zdim,