def __init__(self, cfg):
    self.cfg = cfg
    self.out_fn = self.cfg.get("machine", "ext_definitions")
    ensure_dir(os.path.dirname(self.out_fn))
    dep_map_fn = cfg.get("deps", "dep_map")
    self.read_dep_map(dep_map_fn)
    self.lemmatizer = Lemmatizer(cfg)
def prepare_articles(articles, from_cache=False):
    texts = []
    lemmatizer = Lemmatizer()
    german_stop_words = stopwords.words('german')
    filename = "data/lda-trainingdata.pickle"
    if from_cache:
        with open(filename, 'rb') as file:
            texts = pickle.load(file)
        return texts
    else:
        for article in progressbar(articles):
            article_text = ""
            for text in [article.description, article.title,
                         article.fulltext if article.fulltext else article.content]:
                if text:
                    # Remove the '... [+ xxx chars]' pattern from 'content'
                    # (raw string avoids an invalid-escape warning)
                    text = re.sub(r'\[.*?\]', '', text)
                    text = " ".join(x for x in text.split() if x.isalnum() or '.' in x)
                    article_text += lemmatizer.lemmatize_text(text=text, verbose=False)
            article_text = [x for x in article_text.split() if x not in german_stop_words]
            texts.append(article_text)
        # Cache the LDA training data
        if not os.path.exists("data"):
            os.makedirs("data")
        with open(filename, 'wb') as file:
            pickle.dump(texts, file)
        return texts
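# Usage sketch, not part of the original module: assumes an iterable of
# article objects exposing .description, .title, .content and .fulltext;
# load_articles is a hypothetical loader.
if __name__ == "__main__":
    articles = load_articles("articles.json")  # hypothetical helper
    texts = prepare_articles(articles)  # builds and pickles the training data
    texts = prepare_articles(articles, from_cache=True)  # reuses the pickle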
def preprocessing(self, text, lang):
    '''Tokenize the text into words and sentences.'''
    self.stop_words = stopwords.words(lang) + list(punctuation)
    if lang == 'indonesian':
        self.lmm = Lemmatizer()
    elif lang == 'english':
        self.lmm = WordNetLemmatizer()
    self.tokenized_sent = list(set(sent_tokenize(text)))
def __init__(self, cfg):
    try:
        self.batch = cfg.getboolean('similarity_machine', 'batch')
    except NoSectionError:
        self.batch = False
    self.cfg = cfg
    self.lemmatizer = Lemmatizer(cfg)
    self.machine_wrapper = MachineWrapper(cfg)
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
def __init__(self, cfg):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    self.out_fn = self.cfg.get("machine", "definitions_binary_out")
    ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.read_dep_map(dep_map_fn)
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.word2lemma = {}
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_only = cfg.getboolean('filter', 'first_only')
def process(filename):
    global lemmatizer
    lemmatizer = Lemmatizer()
    raw_tweets = Tokenizer.parse(filename)
    raw_tweets, hashtags = Tokenizer.extract_hashtags(raw_tweets)
    raw_tweets, mentions = Tokenizer.extract_mentions(raw_tweets)
    raw_tweets, emojis = Tokenizer.extract_emojis(raw_tweets)
    tweets = []
    for text, hashtag, mention, emoji in zip(raw_tweets, hashtags, mentions, emojis):
        tweets.append(Tweet(Tokenizer.tokenize(text), hashtag, mention, emoji))
    return tweets
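# Usage sketch: `filename` should point at a file of raw tweets in whatever
# format Tokenizer.parse expects (not specified in this excerpt).
tweets = process("tweets.txt")  # "tweets.txt" is a placeholder path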
def __init__(self, cfg, cfg_section='word_sim'):
    self.batch = cfg.getboolean(cfg_section, 'batch')
    logging.warning("fourlangpath is {0}".format(cfg.get(cfg_section, 'fourlangpath')))
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
    self.sim_feats = SimFeatures(cfg, cfg_section)
    self.expand = cfg.getboolean(cfg_section, "expand")
    logging.info("expand is {0}".format(self.expand))
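# Illustrative configuration for the constructor above, built with the
# standard library's configparser. The option names are taken from the code;
# the values are placeholders, and the Lemmatizer and SimFeatures objects may
# read further options not shown here.
import configparser

cfg = configparser.ConfigParser()
cfg["word_sim"] = {
    "batch": "false",
    "fourlangpath": "/path/to/4lang",
    "graph_dir": "/path/to/graphs",
    "definitions_binary": "/path/to/definitions.bin",
    "expand": "true",
}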
def parse_sentence(self, s):
    keywords = []
    # Lemmatize the sentence and keep only verbs, nouns, dates and PTs
    lemmatizer = Lemmatizer()
    lemmas = lemmatizer.lemmatize(s)
    lemmas = lemmatizer.filter(lemmas, ['V', 'N', 'W', 'PT'])
    # Normalize the lemmas (renamed loop variable to avoid shadowing the
    # lemmatizer, which the original bound to `l` as well)
    for lemma in lemmas:
        if lemma['tag'] == 'W':
            norm_lemma = lemma['lemma']
        else:
            norm_lemma = self.normalize(lemma['lemma'])
        if len(norm_lemma) > 0 and norm_lemma not in ignore_lemmas:
            keywords.append(norm_lemma)
    self.vprint("Keywords: ", keywords)
    return [self.crawler.getwordid(word) for word in keywords]
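# Illustrative shape of the lemmatizer output, inferred from the keys used
# above ('lemma', 'tag'); not taken from the Lemmatizer documentation.
lemmas = [
    {'lemma': 'run', 'tag': 'V'},         # verb
    {'lemma': 'meeting', 'tag': 'N'},     # noun
    {'lemma': '2021-05-01', 'tag': 'W'},  # date: kept verbatim above
]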
from lemmatizer import Lemmatizer

lemma = Lemmatizer()
words = [
    'bersetubuh', 'berdansa', 'penamaan', 'berusaha', 'berdansa',
    'bolak-balik', 'gemetar', 'petanggungjawaban', 'kepastian',
    'berpendidikan', 'berhubungan', 'berwawasan', 'pengetahuan',
    'pengembala', 'penarikan', 'terbengkalai', 'rumahku',
    'penanggulangan', 'perpecahan', 'pemalas', 'tertikunganlah',
    'perdamaian', 'terbirit-birit', 'cebokan', 'mengotomatisasikan',
    'menyelesaikan', 'sekawasan', 'pengertian', 'ketidakpastian',
]
# Print all lemmas on one line, as the original single print call did
print(*(lemma.lemmatize(word) for word in words))
def indexCorpus():
    indexer = Indexer(database)
    # Index normal articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles")
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
    # Index lemmatized articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles_lemma")
    indexer.output_catalog = "./indexes_lemmatized/"
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()


if __name__ == "__main__":
    if len(sys.argv) > 2:
        _usage()
    lemmatizer = Lemmatizer()
    lemmatizer.makeDictionaryMap()
    if len(sys.argv) == 2:
        if sys.argv[1] == 'index':
            database = Database()
            lemmatizeCorpus(lemmatizer)
            indexCorpus()
    app.run()
def __init__(self, wiki_file: str) -> None:
    self.wiki_file = wiki_file
    self.l = Lemmatizer()  # noqa
import sys

from lemmatizer import Lemmatizer

src = sys.argv[1]
tgt = sys.argv[2]
lemm_cz = Lemmatizer(src, "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % src,
                     "il2", path="/home/big_maggie/usr/nmt_scripts/liblemm.so")
lemm_en = Lemmatizer(tgt, "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % tgt,
                     "il2", path="/home/big_maggie/usr/nmt_scripts/liblemm.so")

# TODO: for each sentence, replace the entities created by the tokenizer
# with the original tokens
for line in sys.stdin:
    # line = line.decode('utf-8')
    print('\t'.join((str(lemm_cz.get_lang(line, 0.5, src)),
                     str(lemm_en.get_lang(line, 0.5, tgt)))))
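# Usage sketch: the script reads sentences from stdin and prints one
# tab-separated pair of language scores per line; the script name is
# hypothetical.
#
#     cat corpus.txt | python lang_scores.py cs en > scores.tsv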
def main():
    # Read data from the raw data file
    file_reader = FileReader('train.csv')
    # Get text from raw data
    train = file_reader.get_text()
    # Get labels and classes from raw data
    labels, cla = file_reader.get_labels()
    # Because all the basic functions are implemented by ourselves in this
    # project, preprocessing takes longer than with NLTK's built-in functions.
    # Therefore, we use only 10k rows to test the algorithm here.
    train_list = list(train)[:10000]
    print('Clean the data, remove special punctuations, numbers and abbreviations....')
    # Store data after cleaning
    clean_list = list()
    cleaner = DataClean()
    for train_data in train_list:
        clean_list.append(cleaner.clean(train_data))
    print('Data clean done!')
    print('')
    tkn = Tokenizer()
    # Train a random forest POS tagger classification model
    print('Training a POS tagger classification model....')
    pos_tagger, onehot_enc = train_pos_tag()
    print('Model training done!')
    print('')
    text_list = list()
    # Split text into sentences before POS tagging
    print('Start tokenizing and lemmatizing....')
    print('This step will take a few minutes')
    for clean_data in clean_list:
        sents = tkn.sent_tokenize(clean_data)
        text_list.append(sents)
    # Features for POS tagging
    features = [
        'word', 'is_first_word', 'is_last_word', 'prev_word', 'prev_word_last_1',
        'prev_word_last_2', 'next_word', 'is_numeric', 'first_1', 'first_2',
        'first_3', 'first_4', 'last_1', 'last_2', 'last_3', 'last_4',
        'is_numeric', 'word_has_hyphen'
    ]
    # Init the Lemmatizer
    lem = Lemmatizer()
    lem_texts = list()
    # Tokenize, POS-tag and lemmatize sentence by sentence
    for sents in text_list:
        word_features = pd.DataFrame(get_data_label(sents, label=False))
        # Some data is empty
        if not word_features.empty:
            word_encode = word_features[features].values
            word_encode = onehot_enc.transform(word_encode)
            pred_pos = pos_tagger.predict(word_encode)
            lem_text = list()
            text = word_features.word
            for index in range(len(text)):
                lem_text.append(lem.lemmatize(text[index], tag_map(pred_pos[index])))
            lem_texts.append(lem_text)
        else:
            lem_texts.append([])
    print('Done!')
    print('')
    print('Start building the Vocabulary for our data....')
    voc = Vocabulary(lem_texts)
    voc.remove_stop_words()
    print('Done!')
    print('')
    print('Calculating idf....')
    print('It may take 3 minutes in this step')
    # Get the idf word dict from Vocabulary
    idf_reference = voc.idf()
    idf = np.zeros([len(voc)])
    for word in idf_reference:
        idf[voc.pos(word)] = idf_reference[word]
    print('idf done!')
    print('')
    # The tf-idf encoded array
    data_array = np.zeros([len(lem_texts), len(voc)], dtype='int16')
    print('Calculating tf-idf....')
    for index, text in enumerate(lem_texts):
        vec = Vector(text, voc)
        data_array[index] = idf * vec.tf()
    print('Done!')
    print('')
    X, Y, test_X, test_Y = train_test_split(data_array, labels, test_size=0.5)
    # Split the train set into 5 folds for cross-validation.
    # However, cross-validation is time-consuming and not necessary in this
    # project; we just use one validation set to choose the best threshold.
    k = 5
    fold_list = k_fold(X, k=k)
    one_size = len(fold_list[0])
    train_X = np.zeros([one_size * 4, test_X.shape[1]])
    train_Y = np.zeros([one_size * 4, 6], dtype='int64')
    # Split into train and validation datasets
    for index, fold in enumerate(fold_list):
        if index != k - 1:
            train_X[index * one_size:index * one_size + one_size] = X[fold]
            train_Y[index * one_size:index * one_size + one_size] = Y[fold]
        else:
            val_X = X[fold]
            val_Y = Y[fold]
    preds = np.zeros((len(val_X), len(cla)))
    Pred_test = np.zeros((len(test_X), len(cla)))
    # We use LogisticRegression to train 6 models, one for each category
    for index, cat in enumerate(cla):
        print('fit', cat)
        m, r = get_mdl(train_Y[:, index], train_X)
        preds[:, index] = m.predict_proba(val_X * r)[:, 1]
        Pred_test[:, index] = m.predict_proba(test_X * r)[:, 1]
    # Search for the best threshold
    threshold = [0.55, 0.6, 0.65, 0.7, 0.75]
    result_list = list()
    for t in threshold:
        sum_result = 0
        row, col = preds.shape
        pred_Y = np.zeros([row, col])
        for i in range(row):
            for j in range(col):
                if preds[i, j] >= t:
                    pred_Y[i, j] = 1
                else:
                    pred_Y[i, j] = 0
        # Print out the prediction results
        print(f'Validation set Accuracy (threshold={t}):')
        for index, cat in enumerate(cla):
            result = (pred_Y[:, index] == val_Y[:, index]).sum() / len(pred_Y)
            sum_result += result
            print(f'{cat} : {result}')
        print('')
        result_list.append(sum_result)
    # Use the best threshold to predict on the test data set
    t = threshold[np.argmax(np.array(result_list))]
    print(f'The best threshold is {t}')
    row, col = Pred_test.shape
    pred_test_Y = np.zeros([row, col])
    for i in range(row):
        for j in range(col):
            if Pred_test[i, j] >= t:
                pred_test_Y[i, j] = 1
            else:
                pred_test_Y[i, j] = 0
    print('')
    print('#######################################')
    print('#######################################')
    print(f'Test set Accuracy (threshold={t}):')
    for index, cat in enumerate(cla):
        result = (pred_test_Y[:, index] == test_Y[:, index]).sum() / len(pred_test_Y)
        print(f'{cat} : {result}')
def __init__(self, dictionary):
    self.dictionary = dictionary
    self.lemmatizer = Lemmatizer(dictionary)
    self.rules = RULES
    # Runtime cache for tag queries against the dictionary
    self.tag_query_cache = {}