Example #1
def extract_feature():
    """
    Extract corpus features
    """
    corpus = get_corpus()
    corpus.initialize()
    corpus.cal_feature()
Example #2
def train():
    """
    Corpus processing and HMM model probability computation
    """
    corpus = get_corpus()
    corpus.initialize()
    corpus.cal_state()
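Across Examples #1 and #2, get_corpus() returns a shared object with a small pipeline interface (initialize, cal_feature, cal_state). A minimal stub consistent with those calls, as a sketch only; everything beyond the method names shown above is hypothetical:

class Corpus:
    """Hypothetical stand-in for the object returned by get_corpus()."""

    def initialize(self):
        # load and segment the raw corpus
        pass

    def cal_feature(self):
        # extract per-token features from the corpus
        pass

    def cal_state(self):
        # estimate the HMM initial/transition/emission probabilities
        pass

_corpus = None

def get_corpus():
    # module-level singleton, matching the repeated bare calls above
    global _corpus
    if _corpus is None:
        _corpus = Corpus()
    return _corpus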
Example #3
async def publications(first: str, second: str) -> List[Dict]:
    """
    - **first**: first element or measure
    - **second**: second element or one label
    """
    index = INDEX
    return get_corpus(**locals())
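At the point of the call, locals() evaluates to {'first': ..., 'second': ..., 'index': INDEX}, so every local is forwarded to get_corpus as a keyword argument. A self-contained illustration of the pattern (names hypothetical):

def endpoint(first: str, second: str) -> dict:
    index = 7
    return dict(**locals())

print(endpoint("a", "b"))  # {'first': 'a', 'second': 'b', 'index': 7}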
Example #4
def main():
    """set-up applcation and send love"""
    api = get_handle()
    markov = Markov(get_corpus())
    people = People(api)

    love = markov.generate_markov_text(random.randrange(15, 20))
    to_email = people.get_random()
    print(to_email)
    print(love)
    if api.send_love(to_email, love):
        return 0
    else:
        return -1
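The Markov class itself is not shown. Assuming get_corpus() yields a flat list of tokens, a minimal bigram-chain sketch of what generate_markov_text could look like (not the project's implementation):

import random

class Markov:
    def __init__(self, words):
        # map each word to the list of words observed right after it
        self.chain = {}
        for a, b in zip(words, words[1:]):
            self.chain.setdefault(a, []).append(b)

    def generate_markov_text(self, size):
        word = random.choice(list(self.chain))
        out = [word]
        for _ in range(size - 1):
            # restart from a random word if we hit a dead end
            word = random.choice(self.chain.get(word) or list(self.chain))
            out.append(word)
        return ' '.join(out)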
Example #5
def compute_vectors(filename, data_pos):
    data = corpus.get_corpus(filename, data_pos)
    # Word2Vec makes several passes over the corpus (vocabulary build plus
    # training), so it needs a re-iterable sequence, not a one-shot generator
    sentences = list(data['sentence'])
    print('Computing word vectors for {}...'.format(filename))
    # size/iter are the pre-4.0 gensim names (vector_size/epochs in gensim 4+)
    wvmodel = w2v.Word2Vec(
        sentences=sentences,
        sg=1,
        size=6,
        hs=1,
        min_count=1,
        workers=1,
        iter=50000,
        compute_loss=True,
        window=6,
        seed=123,
    )
    print('Saving Model...')
    wvmodel.save(MODELPATH + '/' + filename + '_w2v_6_1')
    return 0
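Once saved, the model can be reloaded and queried; for instance, with MODELPATH and filename as above (the probe word is hypothetical):

model = w2v.Word2Vec.load(MODELPATH + '/' + filename + '_w2v_6_1')
print(model.wv.most_similar('word', topn=5))  # nearest neighbours by cosine similarity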
Example #6
def main():
    model = w2v.Word2Vec.load(MODELPATH + '/dataset7_w2v_6')
    data_corpus = corpus.get_corpus('dataset7', 0)
    # data_orig = corpus.get_datalist('jeu_hota1')
    senvectors = []
    for i in data_corpus.index:
        senvecs = []
        for x in data_corpus.loc[i]['sentence']:
            senvecs.append(model.wv[x])  # per-word vector lookup
        # average the word vectors over the words (axis=0) so every sentence
        # yields one fixed-length vector; axis=1 would average across the six
        # embedding dimensions instead and break the vstack below
        senvecs = np.mean(np.vstack(senvecs), axis=0)
        senvectors.append(senvecs)

    senvectors = np.vstack(senvectors)

    kmeans = KMeans(n_clusters=4)
    kmeans.fit(senvectors)
    cl_kmeans = kmeans.predict(senvectors)
    with open(RESULTPATH + '/kmeans4_dataset7', 'w') as f:
        for l in cl_kmeans:
            f.write(str(l) + '\n')
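A quick sanity check on the written labels, counting how many sentences landed in each of the four clusters:

from collections import Counter

with open(RESULTPATH + '/kmeans4_dataset7') as f:
    labels = [int(line) for line in f]
print(Counter(labels))  # cluster id -> number of sentences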
Example #7
def pre_process():
    """
    Extract corpus features
    """
    corpus = get_corpus()
    corpus.pre_process()
Example #8
def __init__(self):
    self.corpus = get_corpus()
    self.states, self.init_p = self.get_init_state()
    self.trans_p = self.get_trans_state()
    self.vocabs, self.emit_p = self.get_emit_state()
    self.model = self.get_model()
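states/init_p/trans_p/vocabs/emit_p are the classic HMM parameter sets. Assuming they are dicts keyed by state (with trans_p and emit_p nested as state -> {state/word -> prob}), a minimal Viterbi decoder shows how parameters shaped like these are consumed; this is a sketch, not the project's get_model():

def viterbi(obs, states, init_p, trans_p, emit_p):
    # V[t][s] = (best probability of ending in state s at step t, predecessor)
    V = [{s: (init_p[s] * emit_p[s].get(obs[0], 1e-12), None) for s in states}]
    for t in range(1, len(obs)):
        V.append({})
        for s in states:
            p, prev = max((V[t - 1][r][0] * trans_p[r].get(s, 1e-12)
                           * emit_p[s].get(obs[t], 1e-12), r) for r in states)
            V[t][s] = (p, prev)
    # backtrack from the most probable final state
    s = max(V[-1], key=lambda r: V[-1][r][0])
    path = [s]
    for t in range(len(obs) - 1, 0, -1):
        s = V[t][s][1]
        path.append(s)
    return path[::-1]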
Example #9
def __init__(self):
    self.corpus = get_corpus()
    self.corpus.initialize()
    self.classes = len(self.corpus._states)
    self.config = get_config()
    self.model = None
Example #10
        pos_tags.add(l)
        pos_tags.update(rl)
        Rule(l, rl)
    for tag in pos_tags:
        Pattern(tag)

def align_corpus(sens):
    for i, sn in enumerate(sens.sens):
        print('---------------------')
        al = sn.sl.align_tree_to_flat(sn.tl.children)
        sn.sl.filter_align(al)
        sn.sl.suggest_rules([i], sn.tl.children)
        print(al)
        print(sn.sl)
        print(sn.tl)

if __name__ == '__main__':
    import corpus
    parser = corpus.make_corpus_argparse('generate rtx rules from skeleton CFG file')
    parser.add_argument('cfg_rules', help='new-line separated CFG rules, such as generated by rtx-comp -s')
    parser.add_argument('rtx_file', help='file to write generated rules to')
    args = parser.parse_args()
    read_rules(args.cfg_rules)
    generate_rule_file(args.rtx_file)
    sens = corpus.get_corpus(args)
    for sen in sens.sens:
        print(sen.sl)
    print('+++++++++++++++++++++++++++++')
    sens.compile_and_retree(args.rtx_file)
    align_corpus(sens)
Example #11
def __init__(self):
    self.corpus = get_corpus()
    self.config = get_config()
    self.model = None
    self.vectorizer = None
Example #12
def test_with_frequency():
    c = corpus.get_corpus("big-shark-little-shark")
    assert "the" in c
    assert c.get_count("the") > 0
Example #13
def __init__(self):
    self.corpus = get_corpus()
    self.corpus.initialize()
    self.config = get_config()
    self.model = None
Example #14
def main():
    parser = argparse.ArgumentParser(
        description='Run pairwise coreference resolution system')
    parser.add_argument('task',
                        help='choose one task to go on: train/dev/test')
    parser.add_argument('-g', '--gold', default='gold', help='gold file name')
    parser.add_argument('-o',
                        '--output',
                        default='response',
                        help='response file name')
    parser.add_argument('-m', '--model', default='model', help='model name')
    parser.add_argument('-v', '--vec', default='vec', help='vector name')

    args = parser.parse_args()
    task = args.task
    gold = args.gold + ".txt"
    output = args.output + ".txt"
    model_name = args.model + ".pkl"
    vec_name = args.vec + ".pkl"

    path = {
        "dev": 'conll-2012/dev/english/annotations',
        "train": 'conll-2012/train/english/annotations',
        "test": 'conll-2012/test/english/annotations'
    }

    start = time.time()

    print("Parsing", task, "files...")
    X, y, corpus_data, corpus_pairs = get_corpus(path[task],
                                                 train=(task == "train"))
    if task == "train":
        vec = DictVectorizer()
        vec.fit(X)
        print("Saving vec...")
        joblib.dump(vec, vec_name)

        print("Transforming data...")
        clf = MLPClassifier(solver='lbfgs',
                            alpha=1e-5,
                            hidden_layer_sizes=(5, 2),
                            random_state=1)
        X = vec.transform(X)

        print("Training model...")
        model = clf.fit(X, y)
        print("Saving model...")
        joblib.dump(model, model_name)

    else:
        print("Transforming data...")
        vec = joblib.load(vec_name)
        model = joblib.load(model_name)
        X = vec.transform(X)

        print("Predicting...")
        pred = model.predict(X)
        print("sk-learn Pairwise classification F1:",
              f1_score(y, pred, average='macro'))
        pred = iter(pred.tolist())
        generate_files(corpus_data, corpus_pairs, pred, gold, output)

    end = time.time()

    print("Total time used: ", (end - start))
Example #15
    rule_scores = [evaluate_rule(corp, r, constituents) for r in rules]
    rule_ls = list(zip(rules, rule_scores))
    rule_ls.sort(reverse=True, key=lambda x: x[1][0] + x[1][1])
    for r, s in rule_ls:
        print('%s\t%s' % (str(r), s))
    ret = []
    # greedy selection: keep the current best-scoring rule, discard every
    # remaining rule that overlaps it or scores under 1% of the corpus size,
    # then repeat on what is left
    while rule_ls:
        todo = []
        cur = rule_ls[0][0]
        for rl, score in rule_ls[1:]:
            if score[0] + score[1] < (len(corp.sens) / 100.0):
                break
            if not cur.overlap(rl):
                todo.append((rl, score))
        ret.append(cur)
        rule_ls = todo
    return ret


if __name__ == '__main__':
    parser = corpus.make_corpus_argparse(
        'build rules from corresponding word/phrase pairs')
    parser.add_argument('rtx_file', help='file to write generated rules to')
    args = parser.parse_args()
    corp = corpus.get_corpus(args)
    rls = []
    for i in range(10):
        rls += add_rules(corp, 'PHRASE_' + str(i))
        generate_rule_file(args.rtx_file, rls)
        corp.compile_and_retree(args.rtx_file)
Example #16
import preprocessing
import testing
import corpus
import classifiers_ioana as c_i
import data_models

if __name__ == "__main__":
    # Gather news from websites and separate in json files
    # scraper.scrape_data()

    # Get fake news and data news corpus
    true_news_corpus, fake_news_corpus = corpus.get_corpus()
    data_models.get_corpus_word_count(true_news_corpus, fake_news_corpus)

    # Preprocess data and add to json files
    # preprocessing.preprocess_data(true_news_corpus, fake_news_corpus)

    # Get preprocessed data
    true_pre_data, fake_pre_data = preprocessing.get_preprocessed_data()
    data_models.get_processed_data_word_count(true_pre_data, fake_pre_data)

    # Merge labeled data
    merged_labeled_data = preprocessing.merge_news(true_pre_data,
                                                   fake_pre_data)

    # Get word_frequency
    word_frequency = preprocessing.get_word_frequency(merged_labeled_data)

    # Get vocabulary
    vocabulary = preprocessing.get_vocabulary(merged_labeled_data,
                                              word_frequency)
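get_word_frequency and get_vocabulary are project-specific. Assuming merged_labeled_data pairs token lists with labels, a plausible frequency sketch (hypothetical, not the project's code):

from collections import Counter

def get_word_frequency(labeled_docs):
    # labeled_docs: iterable of (tokens, label) pairs
    freq = Counter()
    for tokens, _label in labeled_docs:
        freq.update(tokens)
    return freq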
Example #17
def _get_corpus() -> corpus.Corpus:
    selected_corpus_id = _get_settings().selected_corpus_id
    return corpus.get_corpus(selected_corpus_id)
Example #18
def test_without_frequency():
    c = corpus.get_corpus("dolch")
    assert "the" in c
    assert c.get_count("the") is None