Example #1
    def __init__(self, cfg):
        self.cfg = cfg
        self.out_fn = self.cfg.get("machine", "ext_definitions")
        ensure_dir(os.path.dirname(self.out_fn))
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.lemmatizer = Lemmatizer(cfg)
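A minimal sketch of the kind of configuration Example #1 appears to expect, assuming a standard-library configparser.ConfigParser. The section and option names are taken from the snippet above; the class name and file paths are placeholders.

import configparser

# Hypothetical config covering only the options read by the constructor above.
cfg = configparser.ConfigParser()
cfg["machine"] = {"ext_definitions": "output/ext_definitions"}  # placeholder path
cfg["deps"] = {"dep_map": "config/dep_map.txt"}                 # placeholder path

# The class shown above could then be constructed as, e.g.:
# builder = DefinitionExtractor(cfg)  # class name is hypothetical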
Example #2
def prepare_articles(articles, from_cache=False):
    texts = []
    lemmatizer = Lemmatizer()
    german_stop_words = stopwords.words('german')
    filename = "data/lda-trainingdata.pickle"
    if from_cache:
        with open(filename, 'rb') as file:
            texts = pickle.load(file)
            return texts
    else:
        # Remove '... [+ xxx chars]' pattern from 'content'
        for article in progressbar(articles):
            article_text = ""
            for text in [article.description, article.title, article.fulltext if article.fulltext else article.content]:
                if text:
                    text = re.sub(r'\[.*?\]', '', text)
                    text = " ".join([x for x in text.split() if x.isalnum() or '.' in x])
                    article_text += lemmatizer.lemmatize_text(text=text, verbose=False)

            article_text = [x for x in article_text.split() if x not in german_stop_words]
            texts.append(article_text)

        # Cache lda-trainingdata
        if not os.path.exists("data"):
            os.makedirs("data")
        with open(filename, 'wb') as file:
            pickle.dump(texts, file)

    return texts
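A minimal usage sketch for prepare_articles above, assuming article objects that expose the attributes the loop reads (title, description, fulltext, content); the sample data is hypothetical.

from types import SimpleNamespace

# Hypothetical articles exposing only the attributes read above.
articles = [
    SimpleNamespace(
        title="Beispielartikel",
        description="Eine kurze Beschreibung.",
        fulltext=None,
        content="Der eigentliche Artikeltext... [+ 123 chars]",
    ),
]

texts = prepare_articles(articles, from_cache=False)  # parses and caches
texts = prepare_articles(articles, from_cache=True)   # reuses the pickle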
Example #3
    def preprocessing(self, text, lang):
        '''
        Tokenize the text into words and sentences.
        '''
        self.stop_words = stopwords.words(lang) + list(punctuation)
        if lang == 'indonesian':
            self.lmm = Lemmatizer()
        elif lang == 'english':
            self.lmm = WordNetLemmatizer()
        self.tokenized_sent = list(set(sent_tokenize(text)))
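For the English branch of Example #3, the same steps can be reproduced standalone with NLTK; a minimal sketch, assuming the NLTK stopwords, punkt and wordnet data have been downloaded.

from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

text = "Cats were sleeping. Dogs were barking. Cats were sleeping."
stop_words = stopwords.words('english') + list(punctuation)
lmm = WordNetLemmatizer()
tokenized_sent = list(set(sent_tokenize(text)))  # set() drops duplicate sentences
print(tokenized_sent)
print(lmm.lemmatize('barking', pos='v'))  # -> 'bark'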
Example #4
    def __init__(self, cfg):
        try:
            self.batch = cfg.getboolean('similarity_machine', 'batch')
        except NoSectionError:
            self.batch = False

        self.cfg = cfg
        self.lemmatizer = Lemmatizer(cfg)
        self.machine_wrapper = MachineWrapper(cfg)
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
Example #5
    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.read_dep_map(dep_map_fn)
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.word2lemma = {}
Example #6
    def __init__(self, cfg, direct_parse=False):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        if not direct_parse:
            self.out_fn = self.cfg.get("machine", "definitions_binary_out")
            ensure_dir(os.path.dirname(self.out_fn))
        self.dependency_processor = DependencyProcessor(self.cfg)
        dep_map_fn = cfg.get("deps", "dep_map")
        self.undefined = set()
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.read_dep_map(dep_map_fn)
        self.word2lemma = {}
        self.first_only = cfg.getboolean('filter', 'first_only')
Example #7
    def process(filename):
        global lemmatizer
        lemmatizer = Lemmatizer()

        raw_tweets = Tokenizer.parse(filename)

        raw_tweets, hashtags = Tokenizer.extract_hashtags(raw_tweets)

        raw_tweets, mentions = Tokenizer.extract_mentions(raw_tweets)

        raw_tweets, emojis = Tokenizer.extract_emojis(raw_tweets)

        tweets = []

        for text, hashtag, mention, emoji in zip(raw_tweets, hashtags, mentions, emojis):
            tweets.append(Tweet(Tokenizer.tokenize(
                text), hashtag, mention, emoji))

        return tweets
Example #8
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section)
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))
Example #9
    def parse_sentence(self, s):
        keywords = []

        # Lemmatize sentence and only keep verbs, nouns, dates and PTs
        lemmatizer = Lemmatizer()
        lemmas = lemmatizer.lemmatize(s)
        lemmas = lemmatizer.filter(lemmas, ['V', 'N', 'W', 'PT'])

        # Normalize lemmas
        for lemma in lemmas:
            if lemma['tag'] == 'W':
                norm_lemma = lemma['lemma']
            else:
                norm_lemma = self.normalize(lemma['lemma'])

            if len(norm_lemma) > 0 and norm_lemma not in ignore_lemmas:
                keywords.append(norm_lemma)

        self.vprint("Keywords: ", keywords)

        return [self.crawler.getwordid(word) for word in keywords]
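A toy illustration of the tag-filtering step in Example #9; the dict structure only mimics what the loop above reads ('lemma' and 'tag' keys) and is purely hypothetical, since the actual Lemmatizer output format is not shown here.

# Hypothetical lemmatizer output with the keys the loop above accesses.
lemmas = [
    {'lemma': 'run', 'tag': 'V'},
    {'lemma': 'the', 'tag': 'DET'},
    {'lemma': 'marathon', 'tag': 'N'},
    {'lemma': '2021-05-01', 'tag': 'W'},
]
kept = [x for x in lemmas if x['tag'] in ('V', 'N', 'W', 'PT')]
print([x['lemma'] for x in kept])  # ['run', 'marathon', '2021-05-01']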
Example #10
from lemmatizer import Lemmatizer

lemma = Lemmatizer()

print(lemma.lemmatize('bersetubuh'), lemma.lemmatize('berdansa'),
      lemma.lemmatize('penamaan'), lemma.lemmatize('berusaha'),
      lemma.lemmatize('berdansa'), lemma.lemmatize('bolak-balik'),
      lemma.lemmatize('gemetar'), lemma.lemmatize('petanggungjawaban'),
      lemma.lemmatize('kepastian'), lemma.lemmatize('berpendidikan'),
      lemma.lemmatize('berhubungan'), lemma.lemmatize('berwawasan'),
      lemma.lemmatize('pengetahuan'), lemma.lemmatize('pengembala'),
      lemma.lemmatize('penarikan'), lemma.lemmatize('terbengkalai'),
      lemma.lemmatize('rumahku'), lemma.lemmatize('penanggulangan'),
      lemma.lemmatize('perpecahan'), lemma.lemmatize('pemalas'),
      lemma.lemmatize('tertikunganlah'), lemma.lemmatize('perdamaian'),
      lemma.lemmatize('terbirit-birit'), lemma.lemmatize('cebokan'),
      lemma.lemmatize('mengotomatisasikan'), lemma.lemmatize('menyelesaikan'),
      lemma.lemmatize('sekawasan'), lemma.lemmatize('pengertian'),
      lemma.lemmatize('ketidakpastian'))
Example #11

def indexCorpus():
    indexer = Indexer(database)
    # index normal articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles")
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()
    # index lemmatized articles
    indexer.corpus_cursor = database.fetch_data("SELECT * FROM articles_lemma")
    indexer.output_catalog = "./indexes_lemmatized/"
    indexer.compute_tf()
    indexer.compute_tf_idf()
    indexer.purge()


if __name__ == "__main__":
    if (len(sys.argv) > 2):
        _usage()

    lemmatizer = Lemmatizer()
    lemmatizer.makeDictionaryMap()

    if (len(sys.argv) == 2):
        if (sys.argv[1] == 'index'):
            database = Database()
            lemmatizeCorpus(lemmatizer)
            indexCorpus()

    app.run()
Example #12
    def __init__(self, wiki_file: str) -> None:
        self.wiki_file = wiki_file
        self.l = Lemmatizer()  # noqa
Example #13
import sys
from lemmatizer import Lemmatizer
src = sys.argv[1]
tgt = sys.argv[2]
lemm_cz = Lemmatizer(src,
                     "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % src,
                     "il2",
                     path="/home/big_maggie/usr/nmt_scripts/liblemm.so")
lemm_en = Lemmatizer(tgt,
                     "/home/big_maggie/usr/nmt_scripts/lgmf_%s.lex" % tgt,
                     "il2",
                     path="/home/big_maggie/usr/nmt_scripts/liblemm.so")
# TODO: for each sentence, replace the entities created by the tokenizer with the original tokens again
for line in sys.stdin:
    #line=line.decode('utf-8')
    print('\t'.join(
        (str(lemm_cz.get_lang(line, 0.5,
                              src)), str(lemm_en.get_lang(line, 0.5, tgt)))))
Example #14
def main():
    # read data from the raw data file
    file_reader = FileReader('train.csv')
    # get text from raw data
    train = file_reader.get_text()
    # get label and class from raw data
    labels, cla = file_reader.get_labels()

    # Because all the basic functions are implemented by ourselves in this project,
    # data preprocessing takes longer than with NLTK's built-in functions.
    # Therefore, we only use 10k records to test the algorithm here.
    train_list = list(train)[:10000]
    # store data after cleaning
    print(
        'Clean the data, remove special punctuations, numbers and abbreviations....'
    )
    clean_list = list()
    cleaner = DataClean()
    for train_data in train_list:
        clean_list.append(cleaner.clean(train_data))
    print('Data clean done!')
    print('')
    tkn = Tokenizer()
    # train a random forest POS tagger classification model
    print('Training a POS tagger classification model....')
    pos_tagger, onehot_enc = train_pos_tag()
    print('Model training done!')
    print('')
    text_list = list()
    # split text into sents before pos_tag
    print('Start tokenizing and lemmatizing....')
    print('This step will take a few minutes')
    for clean_data in clean_list:
        sents = tkn.sent_tokenize(clean_data)
        text_list.append(sents)
    # features for pos_tag
    features = [
        'word', 'is_first_word', 'is_last_word', 'prev_word',
        'prev_word_last_1', 'prev_word_last_2', 'next_word', 'is_numeric',
        'first_1', 'first_2', 'first_3', 'first_4', 'last_1', 'last_2',
        'last_3', 'last_4', 'is_numeric', 'word_has_hyphen'
    ]
    # init Lemmatizer
    lem = Lemmatizer()
    lem_texts = list()

    # tokenize, pos_tag and lemmatize sentence by sentence
    for sents in text_list:
        word_features = pd.DataFrame(get_data_label(sents, label=False))
        # some data is empty
        if not word_features.empty:
            word_encode = word_features[features].values
            word_encode = onehot_enc.transform(word_encode)
            pred_pos = pos_tagger.predict(word_encode)

            lem_text = list()
            text = word_features.word
            for index in range(len(text)):
                lem_text.append(
                    lem.lemmatize(text[index], tag_map(pred_pos[index])))

            lem_texts.append(lem_text)
        else:
            lem_texts.append([])
    print('Done!')
    print('')

    print('Start building the Vocabulary for our data....')
    voc = Vocabulary(lem_texts)
    voc.remove_stop_words()
    print('Done!')
    print('')

    print('Calculating idf....')
    print('It may take 3 minutes in this step')

    # get idf word dict from Vocabulary
    idf_reference = voc.idf()
    idf = np.zeros([len(voc)])

    for word in idf_reference:
        idf[voc.pos(word)] = idf_reference[word]
    print('idf done!')
    print('')

    # the tf-idf encoded array
    data_array = np.zeros([len(lem_texts), len(voc)], dtype='int16')
    print('Calculating tf-idf....')
    for index, text in enumerate(lem_texts):

        vec = Vector(text, voc)
        data_array[index] = idf * vec.tf()
    print('Done!')
    print('')

    X, Y, test_X, test_Y = train_test_split(data_array, labels, test_size=0.5)

    # split the train set into 5 folds for cross-validation
    # However, cross-validation is time-consuming and not necessary in this project,
    # so we just use one validation set to choose the best threshold
    k = 5
    fold_list = k_fold(X, k=k)
    one_size = len(fold_list[0])
    train_X = np.zeros([one_size * 4, test_X.shape[1]])
    train_Y = np.zeros([one_size * 4, 6], dtype='int64')

    # split train dataset and validation dataset
    for index, fold in enumerate(fold_list):
        if index != k - 1:
            train_X[index * one_size:index * one_size + one_size] = X[fold]
            train_Y[index * one_size:index * one_size + one_size] = Y[fold]
        else:
            val_X = X[fold]
            val_Y = Y[fold]

    preds = np.zeros((len(val_X), len(cla)))
    Pred_test = np.zeros((len(test_X), len(cla)))

    # We use LogisticRegression to train 6 models, one for each category
    for index, cat in enumerate(cla):
        print('fit', cat)
        m, r = get_mdl(train_Y[:, index], train_X)
        preds[:, index] = m.predict_proba(val_X * r)[:, 1]
        Pred_test[:, index] = m.predict_proba(test_X * r)[:, 1]

    # searching for the best threshold
    threshold = [0.55, 0.6, 0.65, 0.7, 0.75]
    result_list = list()
    for t in threshold:
        sum_result = 0
        row, col = preds.shape
        pred_Y = np.zeros([row, col])
        for i in range(row):
            for j in range(col):
                if preds[i, j] >= t:
                    pred_Y[i, j] = 1
                else:
                    pred_Y[i, j] = 0

        # print out the pred result
        print(f'Validation set Accuracy (threshold={t}):')
        for index, cat in enumerate(cla):
            result = (pred_Y[:, index] == val_Y[:, index]).sum() / len(pred_Y)
            sum_result += result
            print(f'{cat} : {result}')
        print('')
        result_list.append(sum_result)

    # Use the best threshold to predict on the test data set
    t = threshold[np.argmax(np.array(result_list))]
    print(f'The best threshold is {t}')
    row, col = Pred_test.shape
    pred_test_Y = np.zeros([row, col])
    for i in range(row):
        for j in range(col):
            if Pred_test[i, j] >= t:
                pred_test_Y[i, j] = 1
            else:
                pred_test_Y[i, j] = 0
    print('')
    print('#######################################')
    print('#######################################')
    print(f'Test set Accuracy (threshold={t}):')
    for index, cat in enumerate(cla):
        result = (pred_test_Y[:, index]
                  == test_Y[:, index]).sum() / len(pred_test_Y)
        print(f'{cat} : {result}')
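The nested thresholding loops in Example #14 can also be written with NumPy broadcasting; a minimal sketch of that single step, with toy probabilities standing in for preds.

import numpy as np

preds = np.array([[0.72, 0.41],
                  [0.58, 0.66]])  # toy class probabilities
t = 0.6
pred_Y = (preds >= t).astype(int)  # same result as the element-wise loops above
print(pred_Y)  # [[1 0]
               #  [0 1]]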
Example #15
    def __init__(self, dictionary):
        self.dictionary = dictionary
        self.lemmatizer = Lemmatizer(dictionary)
        self.rules = RULES
        self.tag_query_cache = {}  # runtime cache for tag queries in the dictionary