Code Example #1
File: solver3.py  Project: tyz910/aij-2019
 def __init__(self, seed=42):
     super(Solver, self).__init__()
     self.is_train_task = False
     self.morph = pymorphy2.MorphAnalyzer()
     self.toktok = ToktokTokenizer()
     self.seed = seed
     self.init_seed()
Code Example #2
 def __init__(self, seed=42):
     self.seed = seed
     self.init_seed()
     self.morph = morph
     self.toktok = ToktokTokenizer()
     self.bert = BertEmbedder()
     self.is_loaded = True
Code Example #3
 def __init__(self):
     self.wl = set()
     for each in brown.words():
         each = each.lower()
         if each.isalpha() and (each not in self.wl):
             self.wl.add(each)
     self.toktok = ToktokTokenizer()
Code Example #4
def tokenize(i_file, o_file):
    toktok = ToktokTokenizer()
    with open(i_file, 'r') as i_f, open(o_file, 'w') as o_f:
        for line in tqdm(i_f):
            line = line.rstrip('\n')
            tokens = toktok.tokenize(line)
            print(' '.join(tokens), file=o_f)
Code Example #5
 def __init__(self, seed=42, data_path = 'data/'):
     self.is_train_task = False
     self.morph = pymorphy2.MorphAnalyzer()
     self.toktok = ToktokTokenizer()
     self.seed = seed
     self.init_seed()
     #self.synonyms = open(os.path.join(data_path, r'synonyms.txt'), 'r', encoding='utf8').readlines()
     #self.synonyms = [re.sub('\.','', t.lower().strip('\n')).split(' ') for t in self.synonyms]
     #self.synonyms = [[t for t in l if t]  for l in self.synonyms]
     self.synonyms = open('./data/synmaster.txt', 'r', encoding='utf8').readlines()
     self.synonyms = [re.sub(r'\.', '', t.lower().strip('\n')).split('|') for t in self.synonyms if len(t) > 5]
     self.antonyms = open('./data/antonyms.txt', 'r', encoding='utf8').readlines()
     self.antonyms = [re.sub(r'\.', '', t.lower().strip('\n')).split('|') for t in self.antonyms if len(t) > 5]
     #self.antonyms = open(os.path.join(data_path, r'antonyms.txt'), 'r', encoding='utf8').readlines()
     #self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
     self.phraseology = []
     self.razgov = []
     self.musor = []
     with open('./data/word.csv', encoding='utf-8') as f:
         for line in f:
             line = line.replace("\n","").split('|')
             if line[1] == '23':
                 self.phraseology.append(line[0])
                 #self.musor.append(line[0])                      
             if line[1] == '25':              
                 self.razgov.append(line[0])
                 #self.musor.append(line[0])  
             if line[1] == '99':              
                 self.musor.append(line[0])                    
Code Example #6
def file_to_features(path, word_vocab, window, min_count, total_w):
    examples = []
    toktok = ToktokTokenizer()
    punct = set(string.punctuation)
    try:
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                for sentence in sent_tokenize(line):
                    words_1 = toktok.tokenize(sentence)
                    words_2 = []
                    for i, word in enumerate(words_1):
                        word_l = word.lower()
                        if word_l not in word_vocab:
                            continue
                        if word_vocab[word_l] < min_count:
                            continue
                        if word in punct:
                            continue
                        frequency = word_vocab[word_l] / total_w
                        number = 1 - math.sqrt(10e-5 / frequency)
                        if random.uniform(0, 1) <= number:
                            continue
                        words_2.append(word)
                    max_j = len(words_2)
                    for i, word in enumerate(words_2):
                        start = i - window if (i - window) > 0 else 0
                        to = i + window if (i + window) < max_j else max_j
                        for j in range(start, to):
                            if i == j:
                                continue
                            target = words_2[j]
                            examples.append((word, target))
    except Exception as error:
        print(error)
    return examples
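The drop test inside file_to_features follows the word2vec-style frequency subsampling rule: a word with corpus frequency f is kept with probability sqrt(1e-4 / f). A minimal standalone sketch of that rule (the keep_probability helper and the sample counts are hypothetical, not part of the original code):

import math
import random

def keep_probability(word_count, total_words, threshold=10e-5):
    # Mirrors the test above: frequent words are kept with probability
    # sqrt(threshold / frequency), capped at 1.0 for rare words.
    frequency = word_count / total_words
    return min(1.0, math.sqrt(threshold / frequency))

# A word seen 50,000 times in a 1,000,000-word corpus (frequency 0.05)
# is kept with probability sqrt(1e-4 / 0.05), roughly 0.045.
if random.uniform(0, 1) <= keep_probability(50_000, 1_000_000):
    print("keep the word")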
Code Example #7
 def __init__(self, log_dir, from_log=False):
     self.log_dir = log_dir
     if checkExistenceFile(os.path.join(log_dir,
                                        "PreprocessorConfig.json")):
         self.read_config()
     self.tok = ToktokTokenizer()
     self.parsing_char_ = sha1(b"sally14").hexdigest()
     self.fitted = False
     if from_log:
         self.fitted = True
         with open(
                 os.path.join(self.log_dir, "vocabulary.json"),
                 "r",
                 encoding="utf-8",
         ) as f:
             self.vocabulary_ = json.load(f)
         with open(
                 os.path.join(self.log_dir, "WordPhrases.json"),
                 "r",
                 encoding="utf-8",
         ) as f:
             p = json.load(f)
             self.phrasewords_ = {
                 i.replace("_", self.parsing_char_): p[i]
                 for i in p.keys()
             }
Code Example #8
 def __init__(self, seed=42, data_path='data/'):
     self.is_train_task = False
     self.morph = pymorphy2.MorphAnalyzer()
     self.toktok = ToktokTokenizer()
     self.seed = seed
     self.init_seed()
     self.synonyms = open(os.path.join(data_path, r'synonyms.txt'),
                          'r',
                          encoding='utf8').readlines()
     self.synonyms = [
         re.sub(r'\.', '',
                t.lower().strip('\n')).split(' ') for t in self.synonyms
     ]
     self.synonyms = [[t for t in l if t] for l in self.synonyms]
     self.antonyms = open(os.path.join(data_path, r'antonyms.txt'),
                          'r',
                          encoding='utf8').readlines()
     self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
     self.phraseology = open(os.path.join(data_path, r'phraseologs.txt'),
                             'r',
                             encoding='utf8').readlines()
     self.phraseology = [[
         l for l in self.lemmatize(l) if l not in
         ['\n', ' ', '...', '', ',', '-', '.', '?', r' (', r'/']
     ] for l in self.phraseology]
Code Example #9
def build_vocabs(directory_path, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    char_vocab.update(['{', '}'])
    filenames = os.listdir(directory_path)
    filepaths = [os.path.join(directory_path, e) for e in filenames]
    for i, filepath in enumerate(filepaths):
        if i % 100 == 0:
            print('Reading file number {}'.format(i), end="\r")
        with open(filepath, 'r', encoding='utf8') as f:
            try:
                line = f.read()
                if 'numbers_' in filepath:
                    tmp = toktok.tokenize(line.lower())
                    # repeat the update so these tokens are counted at least
                    # min_count times; use "_" so the file counter "i" from
                    # enumerate() above is not clobbered
                    for _ in range(min_count):
                        word_vocab.update(tmp)
                else:
                    word_vocab.update(word_tokenize(line.lower()))
                char_vocab.update(line)
            except Exception as error:
                print('Error with file: {}'.format(filepath))
                print(error)
    return word_vocab, char_vocab
Code Example #10
def extract_wiki_fdict():
    f_count = 0
    # for each wiki table, get header name, and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_fdict_path, 'w')
    toktok = ToktokTokenizer()
    tid = 0
    pool = mp.Pool()
    for line in f:
        tid += 1
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except Exception:  # skip tables that do not parse cleanly
            continue
        #combine header and features
        cols_features = pool.map(gov_data.get_col_features,
                                 list(header_content.values()))
        all_col_features = list(
            zip([each[0] for each in header_span], cols_features))
        for i in range(len(all_col_features)):
            if all_col_features[i][1]:
                all_col_features[i][1]['content'] = header_bows[i]
        all_col_features = list(filter(lambda x: x[1], all_col_features))
        f_dest.write(json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
Code Example #11
def loss_char(sentence, position):
    toktok = ToktokTokenizer()
    if sentence[position] in " ,./;'[]\<>?:{}!@#$% ^&*()":
        return sentence
    if sentence[position] == " ":
        return sentence
    if sentence[position] in toktok.tokenize(sentence):
        return sentence
    return sentence[:position] + sentence[position + 1:]
Code Example #12
def RemoveWords_by_tag(text):
    remove_tag_list = ['JJ', 'JJR', 'JJS', 'RBR', 'RBS']
    token = ToktokTokenizer()
    words = token.tokenize(text)
    words_tagged = nltk.pos_tag(words)
    filtered = untag([
        w for w in words_tagged if w[1] not in remove_tag_list
    ])  # Keep only the words whose POS tag is not in the removal list

    return ' '.join(map(str, filtered))
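A short usage sketch for RemoveWords_by_tag as defined above (the sample sentence is an arbitrary example; nltk.pos_tag needs the averaged_perceptron_tagger model, and untag is nltk.tag.util.untag):

import nltk
from nltk.tag.util import untag
from nltk.tokenize import ToktokTokenizer

# nltk.download('averaged_perceptron_tagger')  # one-time setup, if missing
print(RemoveWords_by_tag("The quick brown fox jumped over the lazier dog"))
# Adjectives (JJ/JJR/JJS) and comparative/superlative adverbs (RBR/RBS) are removed.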
Code Example #13
def extract_wiki_features(wiki_feature_path, wiki_bow_path):
    f_count = 0
    # for each wiki table, get header name, and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_feature_path, 'w')
    f_bow = open(wiki_bow_path, 'w')
    toktok = ToktokTokenizer()
    for line in f:
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except Exception:  # skip tables that do not parse cleanly
            continue
        #combine header and features
        for col, f_dict, bows in zip([each[0] for each in header_span],
                                     map(get_col_features,
                                         header_content.values()),
                                     header_bows.values()):
            if f_dict:
                f_dict['_id'] = t['_id']
                f_dest.write(json.dumps({col: f_dict}) + '\n')
                f_bow.write(json.dumps({col: bows}) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
Code Example #14
def extract_gov_fdict(all_resources,
                      fdict_path=gov_data_fdict_path,
                      tid_type='cat_id',
                      restrict_resource=False):
    #extracting features:
    #table_id;label,curated_features,content;label,curated_features...
    f = open(fdict_path, 'w')
    #all_resources = gov_data.read_resources()
    all_resources = gov_data.wrong_csv(all_resources)
    all_resources = list(filter(lambda x: x.status, all_resources))
    if restrict_resource:
        all_resources = gov_data.select_resources(all_resources,
                                                  fsize=50,
                                                  rs_ct=len(all_resources))
    pool = mp.Pool()
    total = len(all_resources)
    count = 0
    toktok = ToktokTokenizer()
    for resource in all_resources:
        print("processing {0}-th resource".format(count))
        for each_data in resource.data_files:
            try:
                if tid_type == 'cat_id':
                    tid = resource.rs_id + ':' + each_data.df_id
                elif tid_type == 'path':
                    tid = resource.path + '/' + each_data.df_id
                d_path = each_data.path + '/data.csv'
                df = pd.read_csv(d_path,
                                 delimiter=',',
                                 quotechar='"',
                                 dtype=str,
                                 na_filter=True)
                cols = df.columns
                contents = [
                    df[each_col].dropna().tolist() for each_col in cols
                ]
                print("extract content finished")
                cols_features = pool.map(gov_data.get_col_features, contents)
                all_col_features = list(zip(cols, cols_features))
                for i in range(len(all_col_features)):
                    if all_col_features[i][1]:
                        all_col_features[i][1]['content'] = toktok.tokenize(
                            ' '.join(contents[i]))
                all_col_features = list(
                    filter(lambda x: x[1], all_col_features))
                f.write(
                    json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
            except Exception as e:
                print(e)
        count += 1
        print("finish {0} out of {1}".format(count, total))
    f.close()
    return all_resources
Code Example #15
File: views.py  Project: eltonxue/Swerve
def index(request):
    global invertedIndex
    global jsonData

    output_links = []
    searchTermsReq = request.GET.get('term', '')

    print(searchTermsReq)

    tokenizer = ToktokTokenizer()

    searchTerms = tokenizer.tokenize(searchTermsReq)

    print(searchTerms)

    response = {}

    output_data = defaultdict(int)
    output_links = []

    for token in searchTerms:
        token = token.lower()
        if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
            print('Looking through high for: ' + token)
            for docFilePath in invertedIndex[token]['high']:
                tfidf = invertedIndex[token]['high'][docFilePath]
                output_data[docFilePath] += tfidf

    if (len(output_data) < 10):
        for token in searchTerms:
            token = token.lower()
            if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
                print('Looking through low for: ' + token)
                for docFilePath in invertedIndex[token]['low']:
                    tfidf = invertedIndex[token]['low'][docFilePath]
                    output_data[docFilePath] += tfidf

    output_data = sorted(output_data.items(), key=itemgetter(1), reverse=True)

    for docFilePath, tfidf in output_data[:10]:
        output_links.append((jsonData[docFilePath], tfidf))

    output_links.sort(key=itemgetter(1), reverse=True)

    response['term'] = searchTermsReq
    response['results'] = output_links
    response['totalURLs'] = len(output_data)
    response['uniqueTokens'] = len(invertedIndex)
    response['totalDocuments'] = len(jsonData)

    return JsonResponse(response)
Code Example #16
 def __init__(self):
     self.morph = morph
     self.mystem = Mystem()
     self.tokenizer = ToktokTokenizer()
     self.w2v = Word2vecProcessor()
     self.synonyms = None
     self.antonyms = None
     self.phraseology = None
     self.phraseologisms = None
     self.prep_synon = None
     self.set_f = None
     self.verbs_dict = None
     self.chasti_rechi = None
     self.set_f_2 = None
Code Example #17
def write_in_file():
    """
    
    """
    # Get the files
    list_of_files = os.listdir(TRAINING_FOLDER_PATH)
    number_of_file = len(list_of_files)

    # Initialise the lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()

    # Initialise the tokenizer
    tokenizer = ToktokTokenizer()
    # Override toktok's ampersand rule so '&' is left as-is instead of
    # being expanded to '&amp; '
    tokenizer.AMPERCENT = re.compile('& '), '& '
    tokenizer.TOKTOK_REGEXES = [
        (regex, sub) if sub != '&amp; ' else (re.compile('& '), '& ')
        for (regex, sub) in ToktokTokenizer.TOKTOK_REGEXES
    ]
    toktok = tokenizer.tokenize

    total_text = pd.Series([])

    # Loop over the first 11 training files only
    for i in range(0, 11):

        file_name = list_of_files[i]
        print(i)

        # open the files
        with open(os.path.join(TRAINING_FOLDER_PATH, file_name),
                  'r',
                  encoding="utf8") as text:

            string_text = text.read()
            splitted = toktok(string_text)
            # Lemmatize
            lemmatized = [lemmatizer.lemmatize(t) for t in splitted]
            tokens = pd.Series(lemmatized)
            # Take off random punctuation
            # All the numbers under the same name
            tokens.loc[tokens.apply(lambda x: x.isnumeric())] = "NUMBER"

            total_text = total_text.append(tokens)

    # Write in a file
    txtfilename = "training_text_file/" + str(i + 1) + "yo.txt"

    with io.open(txtfilename, "w", encoding="utf-8") as f:
        for item in total_text:
            f.write("%s " % item)
Code Example #18
def error_generator(utterance):
    toktok = ToktokTokenizer()
    length = len(utterance)
    nb = nb_of_errors_in_utterance(length) + 1
    utterance = utterance + " "

    for i in range(nb):
        length = len(utterance) - 1
        position = np.random.choice(range(length), p=(length) * [1 / (length)])
        l = len(toktok.tokenize(utterance))
        utterance_old = utterance
        nb = np.random.randint(1, 5)
        utterance = functions[nb](utterance, position)

    return utterance
Code Example #19
def clean_archive_data(folder):
    toktok = ToktokTokenizer()
    if not os.path.exists(f"{folder}-cleaned"):
        os.makedirs(f"{folder}-cleaned")
    for count, file in enumerate(os.listdir(f"{folder}")):
        if count % 1000 == 0:
            print(count)
        file_data = open(f"{folder}/{file}", "r").read()
        try:
            text_newspaper = toktok.tokenize(fulltext(file_data))
            text_newspaper_cleaned = clean(" ".join(text_newspaper))
            with open(f"{folder}-cleaned/{file}", "w") as output:
                output.write(text_newspaper_cleaned)
        except: # pylint: disable=W0702
            print(f"error with {file}", file=sys.stderr)
Code Example #20
def preprocess(data):
    X, Y = [], []
    toktok = ToktokTokenizer()
    for index, review in data.iterrows():
        if (index+1) % 100000 == 0:
            print(index+1)
        # words = nltk.word_tokenize(review['text'])
        tokens = toktok.tokenize(review['text'].lower())
        X.append(tokens)
        # X.append(nltk.word_tokenize(review['text']))
        Y.append(int(review['stars'] - 1))
        # if len(Y) == 10000:
        #     break
    df_new = pd.DataFrame({'text': X, 'stars': Y})
    return df_new
Code Example #21
def load_tokenizer(lang):
    if lang == "ko":
        from konlpy.tag import Mecab
        tokenizer = Mecab()
    elif lang == "ja":
        import Mykytea
        opt = "-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_tw":
        import jieba
        tokenizer = jieba
    elif lang == "vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang == "th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang == "ar":
        import pyarabic.araby as araby
        tokenizer = araby
    # elif lang=="en":
    #     from nltk import word_tokenize
    #     tokenizer = word_tokenize
    else:
        from nltk.tokenize import ToktokTokenizer
        tokenizer = ToktokTokenizer()

    return tokenizer
Code Example #22
File: nextWord.py  Project: arrgee23/ml
def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:  # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])

    except:  # Use a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Make the data ready for building 5-grams
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train an n-gram MLE model with n = 5, as set above

    model.fit(train_data, padded_sents)
    #print(model.vocab)

    return model
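A brief usage sketch for the model returned by makeModel(), relying only on the standard nltk.lm API (the query words and seed are arbitrary examples, not from the original code):

model = makeModel()

print(model.counts['market'])                 # unigram count of an arbitrary word
print(model.score('the', ['of']))             # P('the' | 'of') under the MLE estimate
print(model.generate(10, text_seed=['the']))  # sample 10 tokens following the seed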
Code Example #23
 def tokenize_nltk(self, text):
     toktok = ToktokTokenizer()
     # tokens =[toktok.tokenize(sent) for sent in sent_tokenize(text)]
     tokens = nltk.word_tokenize(text)
     stems = [stemmer.stem(t) for t in tokens]
     # print("Number of tokens:", len(tokens))
     return stems
Code Example #24
def prepare_data(args):
    
    UNK_TOKEN = "<unk>"
    PAD_TOKEN = "<pad>"    
    SOS_TOKEN = "<s>"
    EOS_TOKEN = "</s>"
    
    if args.tokenize:
        toktok = ToktokTokenizer()        
        SRC = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, init_token=None, eos_token=EOS_TOKEN, 
                         lower=args.lower, tokenize=toktok.tokenize)
        TGT = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, init_token=SOS_TOKEN, eos_token=EOS_TOKEN, 
                        lower=args.lower, tokenize=toktok.tokenize)
    else:
        
        SRC = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, init_token=None, eos_token=EOS_TOKEN, lower=args.lower)
        TGT = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, init_token=SOS_TOKEN, eos_token=EOS_TOKEN, 
                        lower=args.lower)

    MAX_LEN = args.max_lenght
    if args.iwslt:
        datatype = IWSLT14
    else:
        datatype = datasets.Multi30k
    train_data, val_data, test_data = datatype.splits(
        exts=('.de', '.en'), fields=(SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
                              len(vars(x)['trg']) <= MAX_LEN
    )
    MIN_FREQ = args.min_freq
    SRC.build_vocab(train_data.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train_data.trg, min_freq=MIN_FREQ)
    return train_data, val_data, test_data, SRC, TGT
Code Example #25
def build_vocabs(filepath, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    with open(filepath, 'r', encoding='utf8') as f:
        try:
            line = f.read()
            if 'numbers_' in filepath:
                tmp = toktok.tokenize(line.lower())
                for i in range(min_count):
                    word_vocab.update(tmp)
            else:
                word_vocab.update(word_tokenize(line.lower()))
            char_vocab.update(line)
        except Exception as error:
            print('Error with file: {}'.format(filepath))
            print(error)
    return word_vocab, char_vocab
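The Counter vocabularies returned by build_vocabs are usually converted into integer index maps before training; a minimal sketch of that step (plain Python; the special tokens and the corpus path are assumptions, not part of the original code):

def build_index(word_vocab, min_count, specials=('<pad>', '<unk>')):
    # Hypothetical follow-up: keep words seen at least min_count times
    # and assign each one a stable integer id.
    itos = list(specials) + [w for w, c in word_vocab.most_common() if c >= min_count]
    stoi = {word: idx for idx, word in enumerate(itos)}
    return stoi, itos

# word_vocab, char_vocab = build_vocabs('corpus.txt', min_count=5)  # 'corpus.txt' is a placeholder
# stoi, itos = build_index(word_vocab, min_count=5)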
Code Example #26
 def tokenize_nltk(self, text):
     self.tokenizer_counter += 1
     logger.debug("item:" + str(self.tokenizer_counter) +
                  "/" + str(self.tokenizer_len))
     toktok = ToktokTokenizer()
     # tokens =[toktok.tokenize(sent) for sent in sent_tokenize(text)]
     tokens = nltk.word_tokenize(text)
     # logger.debug("Number of tokens:" + str(len(tokens)))
     stems = [self.stemmer.stem(t) for t in tokens]
     return stems
Code Example #27
File: natural_language.py  Project: Sandvich/Hashtags
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()

    # Frequencies for each file
    list = []
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [
            toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)
        ]
        fdist = Counter(chain(*tokenised))
        list.append(fdist)

    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in list:
        keys += sublist
    keys = set(keys)

    # Build combined frequency dict
    # Tuple of identifiers for connectives and other common words
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be', 'np$',
                'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'ben', 'wps', 'vbd',
                'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex', 'i', 'dti',
                'dod', 'wrb', 'hvz', 'nn$')
    # This is far from the best way to do this, but I couldn't find the documentation for these identifiers
    frequencies = {}
    for key in keys:
        total = 0
        if (key[0] not in string.punctuation) and (
                key.split('/')[-1]
                not in unwanted):  # Gets rid of unwanted tokens
            for sublist in list:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))

    return frequencies
Code Example #28
    def __init__(self, doc_map):
        self.posting_list = {}
        self.mine = ['br','\'','http','url','web','www','blp','ref','external','links']
        self.stop_words = set(stopwords.words('english')).union(self.mine)
        # self.ps = PorterStemmer().stem
        self.ps = SnowballStemmer("english").stem

        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[0-9]{,4}')
        self.d = doc_map
        self.sent = nltk.data.load('tokenizers/punkt/english.pickle').tokenize         
        self.toktok = ToktokTokenizer()
Code Example #29
 def __init__(self, doc_map):
     self.posting_list = {}
     self.mine = [
         'br', '\'', 'http', 'url', 'web', 'www', 'blp', 'ref', 'external',
         'links'
     ]
     self.stop_words = set(stopwords.words('english')).union(self.mine)
     self.ps = PorterStemmer().stem
     self.tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
     self.d = doc_map
     self.t = 0
     self.toktok = ToktokTokenizer()
Code Example #30
 def __init__(self, stopwords: Set[str]) -> None:
     self.stopwords = stopwords
     self.ps = WordNetLemmatizer()
     self.stemmer = SnowballStemmer("english")
     self.tokenizer = ToktokTokenizer()
     self.puncuation = set(string.punctuation)
     # self.words = set(nltk.corpus.words.words())
     self.pipeline = [
         self.remove_punctuation,
         self.tokenize,
         self.lowering,
         self.remove_words,
         self.remove_stopwords,
         self.remove_digits_and_punctuation,
         self.remove_dangling_puncuation,
         self.remove_single,
         self.stemm,
         self.remove_starting_with_file,
     ]
     self.words_to_remove = set(
         "edit wookieepedia format registerr wrapup wiki sandbox click edit page link code preview button format"
         .split(" "))