def predict_answers(data, word2vec, N):

    stop = stopwords.words('english')

    pred_answs = []
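    # the first row of pred_probs acts as a header naming the four answer columns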
    pred_probs = [["A", "B", "C", "D"]]
    for i in range(data.shape[0]):
        #calculate word2vec for question
        q_vec = np.zeros(N, dtype=float)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                q_vec = np.add(q_vec, w2)
        q_vec = q_vec / linalg.norm(q_vec)
    
        #calculate word2vec for answers
        A_vec = np.zeros(N, dtype=float)
        B_vec = np.zeros(N, dtype=float)
        C_vec = np.zeros(N, dtype=float)
        D_vec = np.zeros(N, dtype=float)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec  and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                A_vec = np.add(A_vec,w2)
    
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                B_vec = np.add(B_vec,w2)
            
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                C_vec = np.add(C_vec,w2)

    
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                #print (w2[0:4])
                D_vec = np.add(D_vec,w2)
    
        A_vec = A_vec / linalg.norm(A_vec) 
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        
        #choose the answer with the highest cosine similarity to the question
        probs = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec)
        idx = probs.argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
        pred_probs.append(probs)
        
    return pred_answs, pred_probs
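A toy usage sketch for predict_answers, not taken from the original project: it assumes the surrounding module also provides tokenize, the NLTK stopwords, and a getword2vecval(N, word, word2vec) that is essentially a lookup of word in word2vec; the vocabulary, question and answers below are invented for illustration.

import numpy as np
import pandas as pd

N = 4
rng = np.random.default_rng(0)
word2vec = {w: rng.normal(size=N) for w in ['sun', 'moon', 'planet', 'rock', 'ice']}

toy = pd.DataFrame({
    'question': ['sun or moon or planet'],
    'answerA': ['sun'], 'answerB': ['moon'],
    'answerC': ['rock'], 'answerD': ['ice'],
})
answers, probs = predict_answers(toy, word2vec, N)
print(answers)    # a single predicted letter, e.g. ['A']
print(probs[1])   # cosine similarities of answers A..D to the question (probs[0] is the header row)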
def get_glove_features(data, word2vec, N):
    stop = stopwords.words('english')

    scores = []
    for i in range(data.shape[0]):
        #calculate word2vec for question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
                
#                 # get all synonyms of the word
#                 syns = wn.synsets(w.lower(), pos='n')
#                 if len(syns)>0:
#                     for syn in syns:
#                         sw = syn.lemma_names()[0]
#                         if sw.lower() in word2vec and sw.lower() not in stop:
#                             q_vec += word2vec[sw.lower()]
        
        q_vec = q_vec / linalg.norm(q_vec)
    
        #calculate word2vec for answers
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec  and w.lower() not in stop:
                A_vec += word2vec[w.lower()]
        
    
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]
        
            
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]
        
    
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]
                
    
        A_vec = A_vec / linalg.norm(A_vec) 
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
                
        scores.append(np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec))
        
    return scores
Example #3
def __build_dictionary(synset, hyperhypo):
    lesk_dictionary = []
    # Includes definition.
    lesk_dictionary += tokenize(synset.definition)
    # Includes lemma_names.
    lesk_dictionary += synset.lemma_names
    # Optional: includes lemma_names of hypernyms and hyponyms.
    if hyperhypo:
        related_senses = synset.hypernyms() + synset.hyponyms()
        for related_sense in related_senses:
            lesk_dictionary += tokenize(related_sense.definition)
            lesk_dictionary += [lemma.name for lemma in related_sense.lemmas]

    without_stop_words = filter(lambda word: word not in english_stopwords, lesk_dictionary)
    return map(lambda word: word.lower(), without_stop_words)
def generate_citations(lines, vocab, index):
    word2idx = dict([(v, k) for k, v in enumerate(vocab)])
    for line in lines[:100]:
        tokenized = list()
        capitalized = list()
        for word, cap in zip(utils.tokenize(line, periods=True), utils.tokenize(line, periods=True, capitalized=True)):
            if word == '.':
                if len(tokenized) > 10:
                    citation = generate_citation([word2idx[w] for w in tokenized if w in word2idx], index)
                    print(' '.join(capitalized) + ' (%s).' % citation)
                tokenized = list()
                capitalized = list()
            else:
                tokenized.append(word)
                capitalized.append(cap)
Example #5
def predict_segmented_tf_idf(data, docs_per_q, ids_and_categories):  
    #index docs
    res = []
    category_tf_idfs = {}
    for index, row in data.iterrows():
        current_id = str(row['id'])
        print current_id
        current_category = ids_and_categories[current_id]

        if category_tf_idfs.get(current_category) is None:
            category_tf_idfs[current_category] = utils.get_docstf_idf(wiki_docs_dir + '/%s' % current_category)

        docs_tf, words_idf = category_tf_idfs[current_category]

        #get answers words
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
    
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
    
        q = row['question']
        
        for d in zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w] # term frequency in the document times the word's inverse document frequency, log(num_docs / docs_containing_word)
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A','B','C','D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
        
    return res
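A minimal numeric sketch of the scoring rule above, under the assumption (suggested by the inline comment) that docs_tf maps each document to per-word term frequencies and words_idf maps each word to log(number_of_docs / docs_containing_word); all numbers are invented.

import math

docs_tf = {
    'doc1': {'photosynthesis': 3, 'energy': 1},
    'doc2': {'energy': 2, 'mitochondria': 4},
}
n_docs = 2
df = {'photosynthesis': 1, 'energy': 2, 'mitochondria': 1}
words_idf = {w: math.log(float(n_docs) / df[w]) for w in df}

answer_words = {'photosynthesis', 'energy'}
score = 0.0
for d in ('doc1',):                       # pretend doc1 was retrieved for the question
    for w in answer_words:
        if w in docs_tf[d]:
            score += docs_tf[d][w] * words_idf[w]
print(score)   # 3*log(2/1) + 1*log(2/2) = 3*log(2) ≈ 2.079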
    def matchUp(self, token, ingredientRow):
        """
        Returns our best guess of the match between the tags and the
        words from the display text.

        This problem is difficult for the following reasons:
            * not all the words in the display name have associated tags
            * the quantity field is stored as a number, but it appears
              as a string in the display name
            * the comment is often a compilation of different comments in
              the display name

        """
        ret = []

        # strip parens from the token, since they often appear in the
        # display_name, but are removed from the comment.
        token = utils.normalizeToken(token)
        decimalToken = self.parseNumbers(token)

        for key, val in ingredientRow.iteritems():
            if isinstance(val, basestring):

                for n, vt in enumerate(utils.tokenize(val)):
                    if utils.normalizeToken(vt) == token:
                        ret.append(key.upper())

            elif decimalToken is not None:
                try:
                    if val == decimalToken:
                        ret.append(key.upper())
                except:
                    pass

        return ret
Example #7
 def testTokens(self):
     tokens = utils.tokenize(self.str3)
     self.assertEqual(11, len(tokens))
     self.assertEqual('\n  two empty spaces and some escaped chars \\\"\\\' in normal textfollowed by a ', tokens[0]['token'])
     self.assertEqual('"dbl quote"', tokens[1]['token'])
     self.assertEqual(' and then a ', tokens[2]['token'])
     self.assertEqual("'single quote'", tokens[3]['token'])
     self.assertEqual('\nwait there is more!! ', tokens[4]['token'])
     self.assertEqual('"\'signle quotes\' inside a double quote"', tokens[5]['token'])
     self.assertEqual(' and ', tokens[6]['token'])
     self.assertEqual('\'"double quotes" inside a single quote\'', tokens[7]['token'])
     self.assertEqual('\nwait! there\\\'s more!! ', tokens[8]['token'])
     self.assertEqual('"escaped double quotes \\" and escaped single quotes\\\' "', tokens[9]['token'])
     self.assertEqual(' ', tokens[10]['token'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[0]['type'])
     self.assertEqual(utils.TOKEN_DBL_Q, tokens[1]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[2]['type'])
     self.assertEqual(utils.TOKEN_SNG_Q, tokens[3]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[4]['type'])
     self.assertEqual(utils.TOKEN_DBL_Q, tokens[5]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[6]['type'])
     self.assertEqual(utils.TOKEN_SNG_Q, tokens[7]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[8]['type'])
     self.assertEqual(utils.TOKEN_DBL_Q, tokens[9]['type'])
     self.assertEqual(utils.TOKEN_NORMAL, tokens[10]['type'])
def FrequentWords(data_dirs, suffixes, max_key_words):
  """
  Returns a dictionary of min(max_key_words, percentile_key_words), giving key
  word with its count.
  """
  matches = matchingFiles(data_dirs, suffixes)

  token_count = Counter()
  files_done = 0
  for file_name in matches:
    tokens = tokenize(file_name)
    for token in tokens:
      if len(token) == 0:
        continue
      token_count[token] += 1  # Counter defaults missing keys to 0, so no try/except is needed
    files_done += 1
    if (files_done % 5000 == 0):
      print("Completed parsing %d files ..." % files_done)

#  num_key_words = min(max_key_words,
#                      math.ceil(percentile_key_words * len(token_count)))
  return token_count.most_common(max_key_words)
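A hypothetical call; the directory and suffixes are placeholders, and matchingFiles/tokenize are assumed to come from the surrounding module.

top_tokens = FrequentWords(['src/'], ['.py', '.cc'], max_key_words=1000)
for token, count in top_tokens[:10]:
    print(token, count)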
Example #9
    def tag(self, text=None):
        """
        Tags the given text.
        
        :param text: a string or unicode object. Strings assumed to be utf-8
        :returns: a list of lists (sentences with tokens).
            Each sentence has (token, tag) tuples.
        """
        result = []
        if text:
            tokens = utils.tokenize(text, clean=False)
            for sent in tokens:
                tags = self.tag_tokens(sent)
                result.append(zip(sent, tags))
        else:
            # read tsv from stdin
            sent = []
            for line in sys.stdin:
                line = line.decode('utf-8').strip()
                if line:
                    sent.append(line.split()[0])
                else:
                    tags = self.tag_tokens(sent)
                    result.append(zip(sent, tags))
                    sent = []

        return result
Example #10
    def bird_info(self):
        birdv = self.machine.run("echo | birdc | head -1").strip().replace(" ready.", "")
        birdv = birdv.split(" ")
        info = {
            "daemon":  birdv[0],
            "version": birdv[1],
            "ospf": {}
            }

        log.info("[%s] getting OSPF neighbours" % self.hostname())
        output = self.machine.run("echo show ospf neighbors | birdc | sed '/^bird[^ ] .*/d'")
        neighbours = []
        for toks in [tokenize(l) for l in splitlines(output)[2:]]:
            neighbour = {
                "routerid": toks[0]
                }
            if toks[4][0] in ascii_letters:
                neighbour["ifname"] =  toks[4]
                neighbour["v4addr"] =  toks[5]
            else:
                neighbour["v4addr"] =  toks[4]
                neighbour["ifname"] =  toks[5]
            neighbours.append(neighbour)
        info["ospf"]["neighbours"] = neighbours
        return info
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start: end]

        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del(row["input"])

                rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i+1, tokens)
                    print utils.joinLine([token] + features + [self.bestTag(tags)])

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print
 def classify_proba(self, text):
     token_list = tokenize(text)
     token_list = del_stopwords(token_list, self.stopset)
     wordfreq_dict = stat_wordfreq(token_list)
     dictfeats = tfidf(wordfreq_dict, self.idf_dict)
     vecfeats = self.vectorizer.transform(dictfeats).toarray()
     prob = self.classifier.predict_proba(vecfeats)
     return prob[0]
Example #13
 def macaddr(self, iface):
     output = self.machine.run("ip link show dev %s | grep link/ether" % iface).strip()
     if not output:
         return None
     mac = tokenize(output)[1].upper()
     if len(mac.replace("0", "").replace(":", "")) == 0:
         return None
     return mac
def find_similar_articles(corpus_name, method, content, data_dir=os.getcwd(), index=None):

    """
    - corpus_name : Le nom du corpus sur lequel on travaille (fichier .tsv 
        sans l'extension .tsv)
        
    - method : ldan (n = le nombre de topics), lsin ou tfidf
    
    - content : un texte
    
    Renvoie les 5 articles de corpus_name les plus proches du contenu spécifié 
    
    """

    corpus_file = os.path.join(data_dir, corpus_name + '_' + method + '.mm')
    index_file = os.path.join(data_dir, corpus_name + '_' + method + '_index')
    docid_file = os.path.join(data_dir, corpus_name + '_docid.txt')
    
    # Load the corpus
    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Unable to load the file %s. Did you run the corpus_to_matrix.py script first?' % (corpus_file))

    # Load the index file if it was not provided as an argument
    if not index:
        try:
            index = similarities.docsim.Similarity.load(index_file)
        except Exception:
            raise IOError("""Unable to load the file %s. Did you run the %s script with the --saveindex option?""" % (index_file, method))

    dico_file = os.path.join(data_dir, corpus_name + '_wordids.txt')

    # Load the dictionary
    try:
        id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
    except Exception:
        raise IOError("Unable to load the file %s" % (dico_file))

    # Load the model corresponding to the method requested by the user
    if method == 'tfidf':
        model_file = os.path.join(data_dir, corpus_name + '_tfidf_model')
        model = models.tfidfmodel.TfidfModel.load(model_file)

    elif method.startswith('lsi'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.lsimodel.LsiModel.load(model_file)

    elif method.startswith('lda'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.ldamodel.LdaModel.load(model_file)

    tokens = model[id2word.doc2bow(utils.tokenize(content))]

    # Return the 5 closest articles
    sims = index[tokens]   
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return json.dumps([{'id': utils.get_article_by_corpus_number(x[0], docid_file), 'score': round(x[1], 2)} for x in sims[:5]])
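A hypothetical call, assuming the preprocessing scripts mentioned in the error messages have already produced the <corpus>_tfidf.mm, index, wordids and docid files in data_dir; the corpus name and path are placeholders.

result_json = find_similar_articles(
    'wiki_fr', 'tfidf',
    "Photosynthesis converts light energy into chemical energy.",
    data_dir='/data/corpora')
print(result_json)   # JSON list of the 5 closest article ids with their scores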
def word_freq(filenames, stopset):
    wordset = set()   # set of all words seen
    freqset_list = [[],[]] # word-frequency records for negative and positive texts, respectively
    npos = 0 # number of positive texts so far
    nneg = 0 # number of negative texts so far
    icur = 0 # index of the current positive or negative text
    for filename in filenames:
        fr = file(filename, 'r')
        while True:
            line = fr.readline().decode("utf-8")
            if len(line) == 0: # Zero length indicates EOF
                break
            id,label,text = proc_line(line)
            token_list = tokenize(text)
            token_list = del_stopwords(token_list, stopset)
            wordfreq_dict = {}
            for token in token_list:
                wordset.add(token) # add the word to the overall word set
                if wordfreq_dict.has_key(token):
                    wordfreq_dict[token] += 1
                else:
                    wordfreq_dict[token] = 1
            doc = [id, label, wordfreq_dict] # record each text's id, label and word frequencies as a list
            # append the text to the appropriate list
            index = 0
            if label == '1':
                index = 1
                freqset_list[1].append(doc)
                icur = npos
                npos += 1
            elif label == '-1':
                index = 0
                freqset_list[0].append(doc)
                icur = nneg
                nneg += 1
            else:
                print 'tag-unknown text'
                continue
        fr.close()
        # save the feature words to a file
        f = open('./Training/WordSet.txt', 'w')
        for word in wordset:
            string = word + '\n'
            f.write(string.encode("utf-8"))
        f.close()
        # save the raw word frequencies to a file
        f = open('./Training/WordFreq_Orig.txt', 'w')
        for i in range(2):
            for freqset in freqset_list[i]:
                id = freqset[0]
                label = freqset[1]
                freq_list = freqset[2]
                string = id + '\t' + label + '\t'
                for word in freq_list:
                    string += word + ',' + str(freq_list[word]) + ';'
                string += '\n'
                f.write(string.encode('utf-8'))
    return wordset, freqset_list
Example #16
 def v4addr(self, iface):
     output = self.machine.run("ip addr show dev %s | grep '^ *inet '" % iface).strip()
     def parseaddr(a):
         a = a.strip()
         if "/" not in a:
             return a + "/32"
         return a
     tokset = [tokenize(l) for l in splitlines(output)]
     return [parseaddr(toks[1]) for toks in tokset if len(toks) > 0]
Example #17
def find_word_freq(li):
    all_tokens = [normalize(t, lowercase=False)
                  for aff in li
                  for t in tokenize(text_in_element(aff),
                                    split_alphanum=split_alphanum)]
    freq = defaultdict(int)
    for token in all_tokens:
        freq[token] += 1
    return freq
Example #18
def dict_from_file(filename, match_case=True):
    d = defaultdict(list)
    with codecs.open(DICTS_DIR + filename, 'rb', encoding='utf8') as f:
        for line in f:
            tokens = tokenize(normalize(line, lowercase=(not match_case)),
                    split_alphanum=split_alphanum)
            for (nb, token) in enumerate(tokens):
                d[token] += [(tokens, nb)]
        return (d, match_case)
Example #19
def predict_answers(data, word2vec, N):

    stop = stopwords.words('english')

    pred_answs = []
    for i in range(data.shape[0]):
        #calculate word2vec for question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
        q_vec = q_vec / linalg.norm(q_vec)
    
        #calculate word2vec for answers
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec  and w.lower() not in stop:
                A_vec += word2vec[w.lower()]
    
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]
            
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]
    
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]
    
        A_vec = A_vec / linalg.norm(A_vec) 
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        
        #choose the answer with the highest cosine similarity to the question
        idx = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec).argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
        
    return pred_answs
def build_vocab(docs, save_as):
    start = time.time()
    vocab = set()
    for file in utils.iterate_corpus(docs):
        with open(file, 'r') as f:
            tokenized = itertools.chain.from_iterable(utils.tokenize(line) for line in f.readlines())
        vocab.update(tokenized)
    vocab = list(vocab)
    pkl.dump(vocab, open(save_as, 'wb'))
    print('Built vocabulary and saved it to "%s" in %s' % (save_as, utils.strtime(time.time() - start)), file=sys.stderr)
    return vocab
def predict(data, docs_per_q):  
    #index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    
    res = []
    f = []
    for index, row in data.iterrows():
        #get answers words 
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
    
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
    
        q = row['question']
        
        for d in list(zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q)))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A','B','C','D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
        f.append([sc_A, sc_B, sc_C, sc_D])        
     
    features = np.array(f)
    pd.DataFrame({'id': list(data['id']),'fA': features[:,0], 'fB': features[:,1], 'fC': features[:,2], 'fD': features[:,3]})[['id', 'fA', 'fB', 'fC', 'fD']].to_csv('features_ck12.csv', index = False)
    
    return res
    def __init__(self, data, minimum_vocab_fraction=.02, include_ngrams=True):
        self.doc_freq = FreqDist()
        for count, (label, text) in enumerate(data, start=1):
            for word in set(utils.tokenize(text, include_ngrams, limit_ngrams=True)):
                self.doc_freq.inc(word)
        self.doc_count = count

        self.min_vocab_freq = 1
        self.max_vocab_freq = .95 * self.doc_count
        print 'Min/max vocabulary frequency:', self.min_vocab_freq, self.max_vocab_freq

        self.features = sorted(filter(self._is_valid_feature, self.doc_freq))
def load_mol_data(calc_set, opt_set, struct_set, prop_set=None):
    '''
    Load data from data sets and return lists of structure names, full paths
    to the geometry data, the properties, and the meta data.
    '''
    print "Dataset options used"
    print "\tCalculation methods:", calc_set
    print "\tOptimization methods:", opt_set
    print "\tStructure sets:", struct_set
    print "\tProperties:", prop_set
    names = []
    datasets = []
    geom_paths = []
    properties = []
    meta = []
    lengths = []

    for j, base_path in enumerate(opt_set):
        for i, file_path in enumerate(calc_set):
            for m, atom_set in enumerate(struct_set):
                path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, atom_set, file_path)
                with open(path + ".txt", 'r') as f:
                    for line in f:
                        temp = line.split()
                        name, props = temp[0], temp[1:]

                        names.append(name)
                        datasets.append((base_path, file_path, atom_set))

                        geom_path = os.path.join(DATA_BASE_DIR, "mol_data", base_path, 'geoms', 'out', name + '.out')
                        geom_paths.append(geom_path)

                        properties.append([float(x) for x in props])

                        # Add part to feature vector to account for the 4 different data sets.
                        base_part = [j == k for k, x in enumerate(opt_set)]
                        # Add part to feature vector to account for the 3 different methods.
                        method_part = [i == k for k, x in enumerate(calc_set)]
                        # Add part to feature vector to account for the addition of N.
                        atom_part = [m == k for k, x in enumerate(struct_set)]
                        # Add bias feature
                        bias = [1]
                        meta.append(base_part + method_part + atom_part + bias)

                        tokens = tokenize(name, explicit_flips=True)
                        aryl_count = sum([1 for x in tokens if x in ARYL])
                        lengths.append(aryl_count)

    prop_desc = (("H**O", "eV"), ("LUMO", "eV"), ("Excitation", "eV"))
    prop_vals = zip(*properties)
    prop_out = [(x, y, z) for ((x, y), z) in zip(prop_desc, prop_vals)]
    return names, datasets, geom_paths, prop_out, meta, lengths
Example #24
    def quagga_info(self):
        output = self.machine.run("zebra --version")
        info = {
            "daemon": "Quagga",
            "version": tokenize(splitlines(output)[0])[-1],
            "ospf": {}
            }

        neighbours = []
        log.info("[%s] getting OSPF neighbours" % self.hostname())
        output = self.machine.run("echo show ip ospf neighbor | vtysh | grep '^[1-9]'")
        for toks in [tokenize(l) for l in splitlines(output)]:
            if len(toks) == 0:
                continue
            neighbour = {
                "routerid": toks[0],
                "v4addr":   toks[4],
                "ifname":   toks[5].split(":")[0]
                }
            neighbours.append(neighbour)
        info["ospf"]["neighbours"] = neighbours
        return info
def build_index(docs, vocab, save_as):
    start = time.time()
    word2idx = dict([(v, k) for k, v in enumerate(vocab)])
    tf = dict([(i, list()) for i in xrange(len(vocab))])
    df = Counter()
    n_docs = len(list(utils.iterate_corpus(docs)))
    files = list()
    for i, file in enumerate(utils.iterate_corpus(docs)):
        print('%d/%d %s' % (i+1, n_docs, utils.strtime(time.time() - start)), file=sys.stderr, end='\r')
        files.append(file)
        with open(file, 'r') as f:
            text = f.read()
            word_counts = Counter(word2idx[w] for w in utils.tokenize(text))
            df.update(word2idx[w] for w in set(utils.tokenize(text)))
            n_words = utils.counter_sum(word_counts)
            for word, count in word_counts.items():
                tf[word].append((count / math.log(n_words), i))
    for word, docs in tf.items():
        docs.sort(key=lambda x: x[0], reverse=True)
    tfidf = tf, df, files
    pkl.dump(tfidf, open(save_as, 'wb'))
    print('Processed %d documents in %s' % (n_docs, utils.strtime(time.time() - start)), file=sys.stderr)
    return tfidf
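A sketch of how the (tf, df, files) index returned above might be queried for a single word; the scoring (stored tf value times log(n_docs / document_frequency)) mirrors how the postings were built and is an assumption, not code from the original project.

import math

def score_word(word, tf, df, files, word2idx):
    # Rank the documents containing `word` by tf-idf, best first,
    # assuming the postings were produced by build_index above.
    if word not in word2idx:
        return []
    wid = word2idx[word]
    if not df[wid]:
        return []
    idf = math.log(len(files) / float(df[wid]))
    return [(files[doc_idx], tf_val * idf) for tf_val, doc_idx in tf[wid]]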
Example #26
 def tag(self, text):
     """
     Tags the given text.
     
     :param text: a string or unicode object. Strings assumed to be utf-8
     :returns: a list of lists (sentences with tokens). Each sentence has (token, tag) tuples.
     """
     tokens = utils.tokenize(text, clean=False)
     result = []
     for sent in tokens:
         tags = self.tag_tokens(sent)
         result.append(zip(sent, tags))
     
     return result
 def clusterSentence(self, sentence):
     """
     clusters the given sentence with existing cluster or creates a
     new cluster.
     sentence - sentence to be clustered
     """
     words = utils.tokenize(sentence.lower())
     lems = utils.lemmatize(words)
     terms = utils.filterStopWords(lems)
     tf = dict(Counter(terms))
     self.clusterize(tf, sentence)  
  
     # Every time a new sentence is clusterized, save latest clusters
     self.saveClusters()
def predict(data, docs_per_q):  
    #index docs
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    
    res = []
    doc_score = [["A","B","C","D"]]
    for index, row in data.iterrows():
        #get answers words
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
    
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
    
        q = row['question']
        
        for d in list(zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q)))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    sc_A += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]

        res.append(['A','B','C','D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
        doc_score.append([sc_A, sc_B, sc_C, sc_D])
    return res, doc_score
def find_similar_articles(corpus_name, method, id=None, content=None):

    corpus_file = corpus_name + '_' + method + '.mm'
    index_file = corpus_name + '_' + method + '_index'
    docid_file = corpus_name + '_docid.txt'

    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Unable to load the file %s' % (corpus_file))

    try:
        index = similarities.docsim.Similarity.load(index_file)
    except Exception:
        raise IOError('Unable to load the file %s' % (index_file))

    if id is not None:  
        corpus_id = utils.get_article_by_id(id, docid_file)
        tokens = corpus[corpus_id]

    elif content is not None:
        dico_file = corpus_name + '_wordids.txt'

        try:
            id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
        except Exception:
            raise IOError("Impossible de charger le fichier %s" % (dico_file))

        if method == 'tfidf':
            model_file = corpus_name + '_tfidf_model'
            model = models.tfidfmodel.TfidfModel.load(model_file)

        elif method.startswith('lsi'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.lsimodel.LsiModel.load(model_file)

        elif method.startswith('lda'):
            model_file = corpus_name + '_' + method + '_model'
            model = models.ldamodel.LdaModel.load(model_file)

        tokens = model[id2word.doc2bow(utils.tokenize(content))]

    else:
        raise Exception("Il faut fournir un id ou un contenu")

    sims = index[tokens]   
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    return [(utils.get_article_by_corpus_number(x[0], docid_file), x[1]) for x in sims[:5]]
Example #30
 def tag(self, text, no_repeats=False):
     """
     Runs the SRL process on the given text.
     
     :param text: unicode or str encoded in utf-8.
     :param no_repeats: whether to prevent repeated argument labels
     :returns: a list of SRLAnnotatedSentence objects
     """
     tokens = utils.tokenize(text, clean=False)
     result = []
     for sent in tokens:
         tagged = self.tag_tokens(sent)
         result.append(tagged)
     
     return result
Example #31
from collections import Counter

import nltk
import numpy as np
import pandas

# noinspection PyUnresolvedReferences
from utils import tokenize

# importing corpus as resume
resume_file = open('../assets/resume.txt', 'r')
resume = resume_file.read().lower()
resume_file.close()

# tokenizing the resume
tokens = tokenize(resume)

# dividing corpus into 6 documents
k = len(tokens) // 6
documents = []
for i in range(5):
    documents.append(tokens[i * k:(i + 1) * k])
documents.append(tokens[5 * k:])

# calculating most common 5 tokens from each document and storing frequency tables for each document
most_common = set()
document_frequencies = []
for document in documents:
    frequencies = Counter(document)
    document_frequencies.append(frequencies)
    for word, frequency in frequencies.most_common(5):
        most_common.add(word)  # assumed continuation of the truncated snippet: collect each document's top words
Example #32
    def read_training_dataset(self, input_path):
        with open(input_path) as f:

            data = json.load(f)
            self.no_samples = len(data)

            # for padding.
            self.words_converter.T2id('<PAD>')

            self.words_converter.T2id('<SOS>')

            self.slots_converter.T2id('<PAD>')
            self.slots_converter.T2id('<SOS>')

            self.slots_converter.T2id('-')

            for i in tqdm(range(self.no_samples)):

                entry = data[str(i)]

                text = entry["text"]
                text = normalizeString(text)
                tokens = tokenize(text)
                self.stcs_literals.append(tokens)
                tokens_id = [self.words_converter.T2id(id) for id in tokens]
                tokens_id.append(self.words_converter.T2id('</s>'))
                self.stcs.append(tokens_id)
                self.lengths.append(len(tokens_id))

                intent = entry["intent"]

                self.intents.append(self.intent_converter.T2id(intent))

                slots_dictionary = entry["slots"]
                # +1 make room for <SOS>
                slots_id = [self.slots_converter.T2id('-')] * len(tokens_id)
                slots_id[0] = self.slots_converter.T2id('<SOS>')

                no_slots_in_stc = 0
                for slot, target_words in slots_dictionary.items():
                    target_words = normalizeString(target_words)
                    target_word_list = tokenize(target_words)
                    for word in target_word_list:
                        no_slots_in_stc += 1
                        try:
                            idx = tokens.index(word)
                        except:
                            idx = [
                                i for i, s in enumerate(tokens) if word in s
                            ][0]

                        # +1 account for <SOS>
                        slots_id[idx + 1] = self.slots_converter.T2id(slot)

                # keep count of no slots
                for j in range(len(tokens_id) - no_slots_in_stc):
                    self.slots_converter.T2id('-')

                self.slots.append(slots_id)
                # self.slots.append(torch.tensor(slots_id, dtype=torch.long, device=self.device))

            # add padding

            ncols = max(self.lengths)

            self.X = self.stcs
            self.Y = self.slots
# import libraries
import pandas as pd
from sqlalchemy import create_engine
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils import tokenize


print('Loading data...')
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('disaster_message_category', engine)

print('Tokenizing words...')
word_string = " ".join(df['message'])
word_string_final = " ".join(tokenize(word_string))

print('Creating wordcloud...')
wordcloud = WordCloud(width=800,
                      height=400,
                      background_color='white',
                      max_words=300).generate(word_string_final)

print('Generating png image...')
# plot the WordCloud image
plt.figure(figsize=(8, 4), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.savefig('app/static/images/wordcloud.png', dpi=105)
    def qe(self, sourceLang, targetLang, sourceText, targetText):
        """
        Performs translation quality estimation on sourceText to targetText using QuEst++ and fast_align
        It's ok to raise Exceptions here. They are handled upstream.
        """
        os.makedirs('data/tmp', exist_ok=True)

        if not [sourceLang, targetLang] in self.supportedPairs:
            raise Exception("{}-{} language pair not supported".format(
                sourceLang, targetLang))

        # Sanitize input
        aligned = hunalign(sourceText, targetText)
        sourceText = [tokenize(x[0], sourceLang, False) for x in aligned]
        targetText = [tokenize(x[1], targetLang, False) for x in aligned]
        sourceTextPlain = '\n'.join([' '.join(x) for x in sourceText])
        targetTextPlain = '\n'.join([' '.join(x) for x in targetText])

        alignments = fast_align.FastAlign().align(sourceLang, targetLang,
                                                  sourceTextPlain,
                                                  targetTextPlain)['alignment']
        with open('data/tmp/alignments', 'w') as fileAlignments:
            fileAlignments.write(alignments)

        with open('data/tmp/source', 'w') as fileSource:
            fileSource.write(sourceTextPlain)

        with open('data/tmp/target', 'w') as fileTarget:
            fileTarget.write(targetTextPlain)

        with DirCrawler('qe/questplusplus'):
            print("Extracting features")
            (_output, _error) = bash("""
                 java -cp QuEst++.jar:lib/* shef.mt.WordLevelFeatureExtractor
                 -lang english spanish
                 -input ../../data/tmp/source ../../data/tmp/target
                 -alignments ../../data/tmp/alignments
                 -config ../questplusplus-config/config.word-level.properties
                 """)

            outputFile = 'output/test/output.txt'
            if not os.path.isfile(outputFile):
                raise Exception('Server Processing Error')
            with open(outputFile, 'r') as outputFileR:
                features = outputFileR.readlines()

        os.remove('data/tmp/alignments')
        os.remove('data/tmp/source')
        os.remove('data/tmp/target')

        features = [[
            x.split('=')[1] for x in line.rstrip('\n').rstrip('\t').split('\t')
        ] for line in features]
        with open('data/tmp/features', 'w') as fileFeatures:
            fileFeatures.write('\n'.join(['\t'.join(x) for x in features]))
        with open('data/tmp/labels', 'w') as fileLabels:
            fileLabels.write('\n'.join(['1'] * len(features)))

        with DirCrawler('qe/questplusplus'):
            print("Removing output directory structure for feature extractor")
            os.remove(outputFile)
            os.rmdir('output/test')
            os.rmdir('output')

            print("Machine Learning")
            (_output, _error) = bash(f"""
                python learning/src/learn_model.py ../questplusplus-config/svr_{sourceLang}_{targetLang}.cfg
                """)

            with open('predicted.csv', 'r') as predictedFile:
                output = [
                    float(x.rstrip('\n').split('\t')[1])
                    for x in predictedFile.readlines()
                ]
            os.remove('predicted.csv')

        os.remove('data/tmp/features')
        os.remove('data/tmp/labels')
        os.rmdir('data/tmp')
        return {'status': 'OK', 'qe': output}
    # Evaluation setup
    'sample': '如',
    'max_sample_length': 50,
    'sample_range':
    2  # how many words in the dictionary to be considered when sampling
}

# -------------------------Data feeding preparation---------------
# Read and tokenize data
texts = [
    './data/qts_tab.txt', './data/qsc_tab.txt', './data/qtais_tab.txt',
    './data/qss_tab.txt'
]
# max and min length of poem sequence
maxlen = 100
minlen = 7
poems = []
# for t in texts:
#     poems.extend(utils.read_poem(t))
for t in texts:
    poems.extend(utils.read_regular_poem(t))

poems = utils.chop_poems(poems, maxlen, minlen)
data, count, dictionary, reverse_dictionary = utils.tokenize(
    poems, params['vocabulary_size'])

rnnlm = language_model.RNNLM(params, data, count, dictionary,
                             reverse_dictionary)
rnnlm.train(sample_interval=100, save_interval=5000, logger=None)
# rnnlm.sample(sample_len=100, checkpoint_dir='./tmp/rnndata/')
Example #36
data = pd.read_csv('../data/data.csv', skiprows=0)
filtered = data[[
    'REGI', 'TYPO', 'VISUAL_SIMILARITY', 'SOUNDEX_DISTANCE'
]][(data['EDIT_DISTANCE'] == 1) & (data['IS_TYPO'] == 1)
   & ((data['VISUAL_SIMILARITY'] >= 0.8) | (data['SOUNDEX_DISTANCE'] <= 1))]
filtered = filtered[filtered.TYPO.map(lambda x: x.count('.')) == 2]
filtered = filtered[filtered.REGI.map(lambda x: x.count('.')) == 2]
filtered.reset_index(drop=True, inplace=True)

reg_list = list()
typo_list = list()
for i in range(filtered.shape[0]):
    reg_list.append(filtered['REGI'][i].split('.')[0])
    typo_list.append(filtered['TYPO'][i].split('.')[0])

in_list, out_list = utils.tokenize(reg_list, typo_list, token_size)

in_vocab = set()
out_vocab = set()
for name in in_list:
    for char in name:
        in_vocab.add(char)
for name in out_list:
    for char in name:
        out_vocab.add(char)
vocab = in_vocab.union(out_vocab)
num_encoder_tokens = len(in_vocab)
num_decoder_tokens = len(out_vocab)
max_encoder_seq_length = max([len(name) for name in in_list])
max_decoder_seq_length = max([len(name) for name in out_list])
Example #37
def html_to_json(url):
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)
    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the most bottom level tr
        if tr.find_all('tr'):
            continue
        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})

        if is_title_row:
            continue

        if not row_content or not table_title:
            continue

        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']

        matched = False
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        if terminate_on_mismatch and not matched:
            table_title = None
            ignore_image = True
    return result
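From the keys read in html_to_json above, a schema/<category>.json file presumably has a shape like the following; the table name and values are invented for illustration.

example_schema = {
    "General election": {
        "ignore image": True,
        "column index": 0,
        "strict match": ["Turnout", "Total votes"],
        "regex match": ["Certified"],
        "terminate on mismatch": False,
    }
}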
    args = parser.parse_args()

    logging.basicConfig(filename=args.log_filepath,
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    if args.command == "construct_vocab":

        dictionary = corpora.Dictionary()

        count = 0

        line = sys.stdin.readline()
        while line:

            tokens = tokenize(line)

            dictionary.add_documents([tokens], prune_at=None)
            count += 1

            if count % 100000 == 0:
                print_err("line %d %d" % (count, len(dictionary)))

            line = sys.stdin.readline()

        dictionary.save(args.vocabulary_filename)
        dictionary.save_as_text(args.vocabulary_filename + ".txt")

    elif args.command == "construct_corpus":
        # use glob to recurse under data/TXTs directory
def convert_filename(filename):
    tokenized_filename = tokenize(filename.replace('/', '.'))
    return f'{constants.FILE_START} {tokenized_filename} {constants.FILE_END}\n'
Example #40
from nltk.corpus import semcor
import utils

count = 0
num_sentences = 0
for i in range(100):
    sent = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')[i]

    sentence = ""
    name = ""

    for wordform in sent.getchildren():
        sentence += wordform.text + " "
        if wordform.get('pos') == "NN" and wordform.text != "anyone":
            name = wordform.text
            sense_key = wordform.get('lexsn')

    context = utils.tokenize(sentence)

    if name != "":
        best_sense = utils.find_synset(context,name)
        num_sentences += 1
        if sense_key == best_sense.lemmas()[0].key()[-9:]:
            count += 1
    if num_sentences == 50:
        break

print("accuracy = " + str(count*100/num_sentences) + " %")
# http://arxiv.org/abs/1410.4615
# "Sequence to Sequence Learning with Neural Networks"
# https://arxiv.org/abs/1409.3215
reverse = True

data_path = './data'
train_books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt', 'botanical_2.txt', 'botanical_terms.txt'
]
val_books = ['wonderland.txt', 'botanical_1.txt']

if __name__ == '__main__':
    # Prepare training data.
    text = read_text(data_path, train_books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))

    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)
    print(train_encoder[:10])
    print(train_decoder[:10])
    print(train_target[:10])

    input_chars = set(' '.join(train_encoder))
    target_chars = set(' '.join(train_decoder))
    nb_input_chars = len(input_chars)
    nb_target_chars = len(target_chars)
Example #42
        #                                  .replace('ё', 'ё') \
        #                                  .strip()
        line = utils.norm_text2(re2.sub('', line))
        if line:
            lines.append(' '.join(line.split()))
    if len(lines) >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            print(header, file=f)
            f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
    #exit()
if need_enter:
    print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links, isdialog=False)
Example #43
def generate_repo_dataset(fullname, branch, sha_list, repo_dir, writer):
    repo = Repo(repo_dir)
    total_cnt, current_cnt, msg_skip, diff_skip, word_skip = 0, 0, 0, 0, 0
    index_list, origin_target_list, target_list, origin_line_list, line_list, origin_word_list, word_list = [],[],[],[],[],[],[]
    for sha in sha_list:
        commit = repo.commit(sha)
        total_cnt += 1
        commit_msg = commit.message
        sentences = split_sentence(commit_msg)
        if not sentences:
            continue
        commit_msg = sentences[0].strip()
        commit_msg_lower = commit_msg.lower()
        if 'revert' in commit_msg_lower or commit_msg_lower.startswith('merge '):
            msg_skip += 1
            continue
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        origin_commit_msg = commit_msg
        if not commit_msg:
            msg_skip += 1
            continue
        
        commit_msg = tokenize(commit_msg)
        commit_msg = remove_last_special_char(commit_msg.strip())
        commit_msg = remove_no_english_str(commit_msg)
        commit_msg = remove_redundant_white_space(commit_msg.strip())
        commit_msg = commit_msg.strip()
        if not commit_msg:
            msg_skip += 1
            continue
        commit_words = commit_msg.split()
        # if not starts_with_verb(commit_words):
        #     msg_skip += 1
        #     continue
        if len(commit_words) > constants.TARGET_SEQ_LEN_MAX:
            msg_skip += 1
            continue
            
        line_diff = get_line_diff(repo_dir, sha)
        if not line_diff:
            diff_skip += 1
            continue
        origin_line_diff = line_diff
        line_diff = remove_no_english_str(line_diff)
        line_diff = remove_redundant_white_space(line_diff.strip())
        line_diff_words = line_diff.split()
        if not overlap_two_seq(line_diff_words, commit_words):
            diff_skip+=1
            continue

        if len(line_diff_words) > constants.SOURCE_SEQ_LEN_MAX:
            diff_skip+=1
            continue
        
        word_diff = get_word_diff(repo_dir, sha)
        if not word_diff:
            word_skip += 1
            continue
        origin_word_diff = word_diff
        word_diff = remove_no_english_str(word_diff)
        word_diff = remove_redundant_white_space(word_diff.strip())
        if not word_diff:
            word_skip += 1
            continue
        word_diff_words = word_diff.split()
        index = f'{fullname} {sha}'
        writer.write(index, origin_commit_msg, commit_msg, origin_line_diff, line_diff, origin_word_diff, word_diff)
        current_cnt+=1
    print(f'{fullname}:  {current_cnt}/{total_cnt}')
    return current_cnt
Example #44
def search(*arguments):
    print("Loading Files")
    outfile = open("./query_op.txt", 'w')
    with open(arguments[0], 'r') as f:
        queries = f.readlines()
    with open("./inverted_index/titleOffset.txt", 'r') as f:
        titleOffSet = [int(line.strip()) for line in f]
    with open("./inverted_index/offset.txt", 'r') as f:
        offset = []
        for line in f.readlines():
            try:
                offset.append(int(line.strip()))
            except BaseException:
                continue
    vocabFile = open("./inverted_index/vocab.txt", 'r')
    titleFile = open("./inverted_index/title.txt", 'r')
    with open("./inverted_index/fileNumbers.txt", 'r') as f:
        nFiles = int(f.read().strip())
    key_words = ['t:', 'b:', 'i:', 'c:', 'r:', 'l:']
    print("Starting Queries")
    numQueries = 0
    for query in queries:
        startTime = time.time()
        numQueries += 1
        query = query.strip().lower()
        numResults, query = query.split(",")
        query = query.strip()
        numResults = int(numResults)
        queryType = "Plain"
        for w in key_words:
            if w in query:
                queryType = "Field"
                break

        if queryType == "Field":
            q = re.split("(t:)|(b:)|(i:)|(c:)|(r:)|(l:)", query)
            q = [i.strip() for i in q if i is not None and i != ""]
            queryDict = defaultdict(list)
            for idx in range(0, len(q), 2):
                data = tokenize(q[idx + 1].lower())
                data = [w for w in data if w not in stopWords]
                data = stemmer.stemWords(data)
                queryDict[q[idx].split(":")[0]].extend(data)
            results, docFreq = fieldQuery(queryDict, vocabFile, offset)
            results = rank(results, docFreq, nFiles)
        else:
            q = tokenize(query)
            q = [w for w in q if w not in stopWords]
            q = stemmer.stemWords(q)
            t = simpleQuery(q, vocabFile, offset)
            results, docFreq = t[0], t[1]
            results = rank(results, docFreq, nFiles)

        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            results = results[:numResults]
            for key in results:
                title, _ = fileBinarySearch(
                    0, len(titleOffSet), titleOffSet, key, titleFile, 'int')
                print(','.join([key] + [' '.join(title)]), file=outfile)
        endTime = time.time()
        print(
            "{0}, {1}".format(
                endTime - startTime,
                (endTime - startTime) / numResults),
            file=outfile)

        print('\n', file=outfile)
    outfile.close()
Example #45
                text = None
            break
        if not res:
            if not SILENT:
                if not text:
                    print('no text')
                    #if nop:
                    #    exit()
                else:
                    print('text beyond limits:')
                    print(text)
            continue
        texts_total += 1
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
              end='')
        need_enter = True
        #exit()
    if need_enter:
        print()
'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(utils.TEXTS_FOR_SOURCE)
'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(utils.TEXTS_FOR_SOURCE, isdialog=False)
Example #46
            model.cuda()

        gen_text = generation(embedding, model, state, options.n,
                              options.primer)
        print(gen_text)
    else:
        lr = model_settings['learning_rate']
        layers = model_settings['layers']
        batch_size = model_settings['batch_size']
        rnn_size = model_settings['rnn_size']
        embed_size = model_settings['embed_size']
        seq_length = model_settings['seq_length']
        dropout = model_settings['dropout']
        data_size = 256  # ???

        train_x = utils.tokenize(options.train_data)
        train_x = utils.batchify(train_x, batch_size)
        num_batches = train_x.size(0) // seq_length

        if len(options.load_model) > 0:
            checkpoint = torch.load(options.load_model)
            embedding = checkpoint['embed']
            model = checkpoint['rnn']
        else:
            embedding = nn.Embedding(256, embed_size)
            model = Stacked_mLSTM(mLSTM, layers, embed_size, rnn_size,
                                  data_size, dropout)

        loss_fn = nn.CrossEntropyLoss()
        embed_optimizer = optim.Adam(embedding.parameters(), lr=lr)
        model_optimizer = optim.Adam(model.parameters(), lr=lr)
Example #47
    logger = utils.get_logger()

    logger.info('Reading model')
    sess = tf.InteractiveSession()
    model = multimlp.MultiFeedForward.load(args.load, sess)
    word_dict, embeddings = readdata.load_embeddings(args.embeddings, args.vocab,
                                                     generate=False,
                                                     load_extra_from=args.load)
    embeddings = utils.normalize_embeddings(embeddings)
    model.initialize_embeddings(sess, embeddings)
    number_to_label = {v: k for (k, v) in utils.label_map.items()}

    while True:
        sent1 = raw_input('Type sentence 1: ')
        sent2 = raw_input('Type sentence 2: ')
        tokens1 = utils.tokenize(sent1)
        tokens2 = utils.tokenize(sent2)
        vector1 = convert_tokens(tokens1, word_dict, model.max_time_steps1)
        vector2 = convert_tokens(tokens2, word_dict, model.max_time_steps2,
                                 prepend=word_dict[utils.GO])

        feeds = {model.sentence1: vector1,
                 model.sentence2: vector2,
                 model.sentence1_size: [len(tokens1)],
                 model.sentence2_size: [len(tokens2)+1],
                 model.dropout_keep: 1.0}

        answer = sess.run(model.answer, feed_dict=feeds)
        print('Model answer:', number_to_label[answer[0]])

        print()
Example #48
#!/usr/bin/python

from utils import tokenize, stdin

words_count = {}
for line in stdin():
    for word in tokenize(line, [' ', '\t', '-']):
        words_count[word] = words_count.get(word, 0) + 1

sorted_words_count = sorted(words_count.items(),
                            reverse=True,
                            key=lambda tup: tup[1])

for word in sorted_words_count:
    print("%i %s" % (word[1], word[0]))
Example #49
from model import NerModel
import tensorflow as tf  # needed below for tf.data, the Adam optimizer and tf.train.Checkpoint
import tensorflow_addons as tf_ad
import os
import numpy as np
from utils import tokenize, read_vocab, build_vocab  # helpers used below; module name matches the companion test snippet
from args_help import args
from my_log import logger

if not (os.path.exists(args.vocab_file) and os.path.exists(args.tag_file)):
    logger.info("building vocab file")
    build_vocab([args.train_path], args.vocab_file, args.tag_file)
else:
    logger.info("vocab file exits!!")

vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (text_sequences, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(
    args.batch_size, drop_remainder=True)

logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format(
    args.hidden_num, len(vocab2id), len(tag2id)))
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size)
optimizer = tf.keras.optimizers.Adam(args.lr)

ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
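A possible continuation, shown only as a hedged sketch: the tf.train.Checkpoint above is typically paired with a tf.train.CheckpointManager to restore and periodically save weights. The "./checkpoints" directory is an assumption here, not a flag defined in args_help.

# Hedged sketch: wire the checkpoint into a manager (directory name is assumed).
ckpt_manager = tf.train.CheckpointManager(ckpt, directory="./checkpoints", max_to_keep=3)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    logger.info("restored from {}".format(ckpt_manager.latest_checkpoint))
# ckpt_manager.save() would then be called periodically inside the training loop.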
Example #50
    # for tweet in tokenized_tweets:
    #     tweets.append(tweet['clean'])
    #     labels.append(tweet['class'])

    # train = pd.read_csv("../Data/imdb/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    train = pd.read_csv("../Data/imdb/train.tsv",
                        header=0,
                        delimiter="\t",
                        quoting=3)
    # test = pd.read_csv("../Data/imdb/testData.tsv", header=0, delimiter="\t", quoting=3)

    tokenized_train = []

    for idx, text in train.iterrows():
        # tokenized_train.append(ut.tokenize(text['review'], text['sentiment'])) # for labeledTrainData.tsv
        tokenized_train.append(ut.tokenize(text['Phrase'],
                                           text['Sentiment']))  # for train.tsv

    tweets = []
    labels = []
    for tweet in tokenized_train:
        tweets.append(tweet['clean'])
        labels.append(tweet['class'])

    partition = 5
    train_tweets, test_tweets, train_labels, test_labels = ut.crossValidation2(
        tweets, labels, partition)

    # kf = cv.KFold(n=len(tweets), n_folds=3, shuffle=True, indices=False)

    accuracyLR, precisionLR, recallLR, f_measureLR = [], [], [], []
    accuracyRF, precisionRF, recallRF, f_measureRF = [], [], [], []
Example #51
import json
from torch.utils.data import Dataset, DataLoader
from model import NeuralNet
# NOTE: tokenize, stem and bow (bag-of-words) are assumed to come from the project's NLP utils module.

with open('intents.json','r') as f:
    intents = json.load(f)

# print(intents) 
all_words = []
tags = []
xy = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        # use extend instead of append as we don't want an array of arrays
        xy.append((w,tag))

ignore_words = ['?','!','[',']','.',',']
all_words = [stem(w) for w in all_words if w not in ignore_words]

all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []

for (sen, tag) in xy:
    bag = bow(sen,all_words)
Example #52
def main(args):

    nlp = spacy.load('en')
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        # all_program_strs = []
        # for q in questions:
        #   if 'program' not in q: continue
        #   program_str = program_to_strs(q['program'], args.mode)[0]
        #   if program_str is not None:
        #     all_program_strs.append(program_str)
        # program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            # 'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and entities
    print('Encoding data')
    questions_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    questions_len = []
    questions_mask = []
    noun_chunk_starts = []
    noun_chunk_ends = []
    entity_masks = []
    max_entity_length = 5

    for orig_idx, q in enumerate(questions):
        question = q['question'].replace('?', '').replace('.', '').replace(
            ';', ' ;').replace(',', ' ,')

        doc = nlp(question)
        start, end = find_noun_chunks(doc)
        noun_chunk_starts.append(start[:max_entity_length])
        noun_chunk_ends.append(end[:max_entity_length])

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question)

        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)

        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            answers.append(-1)

    # Pad encoded questions and entities
    max_question_length = max(len(x) for x in questions_encoded)

    for st, ed, qe in zip(noun_chunk_starts, noun_chunk_ends,
                          questions_encoded):
        entity_masks.append(
            (np.arange(max_entity_length) < len(st)).astype(int))
        if len(st) < max_entity_length:
            # qe.append(vocab['question_token_to_idx']['<NULL>'])
            padding = [len(qe) - 1] * (max_entity_length - len(st))
            st += padding

        if len(ed) < max_entity_length:
            # qe.append(vocab['question_token_to_idx']['<NULL>'])
            padding = [len(qe)] * (max_entity_length - len(ed))
            ed += padding

        questions_mask.append(
            (np.arange(max_question_length) < len(qe)).astype(int))
        if len(qe) < max_question_length:
            # qe.append(vocab['question_token_to_idx']['<NULL>'])
            padding = [vocab['question_token_to_idx']['<NULL>']
                       ] * (max_question_length - len(qe))
            qe += padding
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    entity_starts = np.asarray(noun_chunk_starts, dtype=np.int32)
    entity_ends = np.asarray(noun_chunk_ends, dtype=np.int32)
    print(entity_starts.shape)

    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        # 'programs': programs_encoded,
        # 'program_inputs': program_inputs_encoded,
        'answers': answers,
        'questions_len': questions_len,
        'questions_mask': questions_mask,
        'e_starts': entity_starts,
        'e_ends': entity_ends,
        'e_masks': entity_masks
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
Example #53
all_words = data["all_words"]
tags = data["tags"]
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Bryant's Coffee shop"
print('Type quit to exit')

while True:
    sentence = input("You: ")
    if sentence == "quit":
        break
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(-1, X.shape[0])
    X = torch.from_numpy(X)

    output = model(X)
    # print(output)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75:
        for intent in intents["intents"]:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
Example #54
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result, build_embedding_matrix
import tensorflow_addons as tf_ad
from args_help import args
import json
import numpy as np

# For the test set: build the vocab dict and tag dict, tokenize into padded sequences, and initialize the word-embedding matrix
vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)
# text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)

# print('Inspect the value and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# Load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model
Example #55
def main(args):
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_strs(q['program'], args.mode)[0]
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    # value_inputs, encoded by question_token_to_idx in CLEVR
    # because all valid inputs are in question vocab
    program_inputs_encoded = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str, input_str = program_to_strs(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
            # program value_inputs
            input_tokens = tokenize(input_str)
            input_encoded = encode(input_tokens,
                                   vocab['question_token_to_idx'])
            assert len(input_encoded) == len(
                program_encoded)  # input should have the same len with func
            program_inputs_encoded.append(input_encoded)
        else:
            programs_encoded.append([-1])
            program_inputs_encoded.append([-1])

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])
        else:
            answers.append(-1)

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])
        for ie in program_inputs_encoded:
            while len(ie) < max_program_length:
                ie.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    program_inputs_encoded = np.asarray(program_inputs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    print(program_inputs_encoded.shape)
    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'programs': programs_encoded,
        'program_inputs': program_inputs_encoded,
        'answers': answers,
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
Example #56
                        default='dictionary.pkl',
                        type=str,
                        help='path to the dictionary')

    args = parser.parse_args()

    # Turns on logging.
    import logging
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    dictionary, rev_dict = utils.get_dictionary(args.text, args.dictionary)
    num_classes = len(dictionary)

    iterator = utils.tokenize(args.text,
                              dictionary,
                              batch_size=args.batch_size,
                              seq_len=args.seq_len)

    sess = tf.Session()
    model = SeqGAN(sess,
                   num_classes,
                   logdir=args.logdir,
                   learn_phase=args.learn_phase,
                   only_cpu=args.only_cpu)
    model.build()
    model.load(ignore_missing=True)

    for epoch in range(1, args.num_epochs + 1):
        for step in range(1, args.num_steps + 1):
            logging.info('epoch %d, step %d', epoch, step)
            model.train_batch(next(iterator))
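utils.tokenize here is consumed with next(iterator), so it is expected to yield training batches indefinitely. A minimal sketch under that assumption (character-level ids are a guess; random contiguous windows reshaped to (batch_size, seq_len)) could be:

import numpy as np

def tokenize_sketch(path, dictionary, batch_size, seq_len):
    # Map each known character to its id once, then yield random training batches forever.
    # Assumes the corpus is longer than one batch_size * seq_len window.
    with open(path, encoding='utf-8') as f:
        ids = np.asarray([dictionary[c] for c in f.read() if c in dictionary], dtype=np.int32)
    span = batch_size * seq_len
    while True:
        start = np.random.randint(0, len(ids) - span)
        yield ids[start:start + span].reshape(batch_size, seq_len)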
Example #57
def add_code_into_document(document, body):
    asts, code_hints = transform_body(body)

    flag = False

    #typed_method_call = set()
    for ast in asts:
        for mc in ast["typed_method_call"]:
            if mc:
                document.add(
                    Field("typed_method_call", mc, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True

        for e in ast["extends"]:
            if e:
                document.add(
                    Field("extends", e, Field.Store.YES, Field.Index.ANALYZED))

        for c in ast["used_classes"]:
            if c:
                document.add(
                    Field("used_classes", c, Field.Store.YES,
                          Field.Index.ANALYZED))

        for m in ast["methods"]:
            if m:
                document.add(
                    Field("methods", m, Field.Store.YES, Field.Index.ANALYZED))
                flag = True

        for m in ast["methods_called"]:
            if m:
                document.add(
                    Field("methods_called", m, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True

        #comment
        if "comments" in ast:
            for c in ast["comments"]:
                document.add(
                    Field("comments", utils.unescape_html(c), Field.Store.NO,
                          Field.Index.ANALYZED))

        for i in ast["class_instance_creation"]:
            if i:
                document.add(
                    Field("class_instance_creation", i, Field.Store.YES,
                          Field.Index.ANALYZED))
                flag = True

        for l in ast["literals"]:
            if l:
                document.add(StringField("literals", l, Field.Store.YES))

        #finally all the splitted words
        # for s in camel_case:
        # 	document.add( Field("camel_case_words", s.lower(), Field.Store.NO, Field.Index.NOT_ANALYZED))

    hints = []
    for h in code_hints:
        for token in utils.tokenize(h):
            if 1 < len(token) < 20:
                hints.append(token)

    for hint in set(hints):
        document.add(
            Field("code_hints", hint, Field.Store.YES, Field.Index.ANALYZED))

    return flag
Example #58
error_rate = 0.6
reverse = True
model_path = './models/seq2seq.h5'
hidden_size = 512
sample_mode = 'argmax'
data_path = './data'
books = [
    'nietzsche.txt', 'pride_and_prejudice.txt', 'shakespeare.txt',
    'war_and_peace.txt'
]

test_sentence = 'The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.'

if __name__ == '__main__':
    text = read_text(data_path, books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))
    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)

    tokens = tokenize(test_sentence)
    tokens = list(filter(None, tokens))
    nb_tokens = len(tokens)
    misspelled_tokens, _, target_tokens = transform(tokens,
                                                    maxlen,
                                                    error_rate=error_rate,
                                                    shuffle=False)
Example #59
def tokenize(line, lower=True, flat=False, clean=True):
    if clean:
        line = Vocab.clean_line(line)
    toks = U.tokenize(line, lower=lower, flat=flat)
    return toks
Example #60
    # split the files deterministically via a seeded shuffle: first half for training, second half for testing
    robj = random.Random(12345)
    robj.shuffle(files)

    if args.command == 'train':
        fileSubset = files[:len(files) / 2]
    elif args.command == 'test':
        fileSubset = files[len(files) / 2:]
    else:
        fileSubset = files[len(files) / 2:]

    if args.command == 'train' or args.command == 'test':
        for i, name in enumerate(fileSubset):
            if i % 1000 == 0:
                print '%d files done' % i
            filesAndTokens.append((name, utils.tokenize(name)))
        print len(filesAndTokens)
        print sum([len(tokens) for name, tokens in filesAndTokens])

#    model = PositionDependentVectorModel(keywords, winSize=args.win,
#                                         wdim=args.dim, stepsize=args.lr,
#                                         reg=args.reg)
#    model = ConstantAttentionVectorModel(keywords, winSize=args.win,
#                                         wdim=args.dim, stepsize=args.lr,
#                                         reg=args.reg)
#    model = NonLinearVectorModel(keywords, winSize=args.win,
#                                 wdim=args.dim, zdim=args.zdim,
#                                 stepsize=args.lr,
#                                 reg=args.reg)
#    model = RnnDense(keywords, winSize=args.win,
#                    wdim=args.dim, zdim=args.zdim,