Example #1
    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        count = 0
        sender_bl = load_pickle('sender_bl.pickle')
        # scan each email and decide whether the message is SPAM or HAM:
        # first check whether the sender occurs in the sender blacklist,
        # then compute per-word spamicity using the Bayesian approach
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            spamicity_list = []
            count += 1
            tokens = tokenize(body)
            # compute spamicity for each word and create list of the values
            for el in tokens:
                word_spamicity = [el, bs.word_spamicity(el)]
                spamicity_list.append(word_spamicity)
            # prepare list for Bayes
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
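            # order words by distance from the neutral spamicity 0.5, most informative first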
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # Consider only 15 'words'
            if prediction > 0.9 or sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
Example #2
    def _process_nonexp_sense(self, articles, which):
        nonexp_feat_name = FILE_PATH + '/../tmp/nonexp.feat'
        nonexp_sense_file = codecs.open(nonexp_feat_name, 'w', 'utf-8')
        nonexpParser = NonExplicit()  # change name later
        for art in articles:
            self.generate_nonexp_relations(art)
            for rel in art.nonexp_relations:
                nonexpParser.print_features(rel, ['xxxxx'], nonexp_sense_file)
        nonexp_sense_file.close()
        nonexp_pred_name = FILE_PATH + '/../tmp/nonexp.pred'
        Corpus.test_with_opennlp(nonexp_feat_name, nonexpParser.model_file, nonexp_pred_name)
        nonexp_res = [l.strip().split()[-1] for l in codecs.open(nonexp_pred_name, 'r', 'utf-8')]

        rid = 0
        for art in articles:
            for rel in art.nonexp_relations:
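                # EntRel and NoRel keep their own relation type; any other predicted sense maps to Implicit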
                pred_sense = nonexp_res[rid]
                if pred_sense == 'EntRel':
                    r_type = 'EntRel'
                elif pred_sense == 'NoRel':
                    r_type = 'NoRel'
                else:
                    r_type = 'Implicit'
                rel.rel_type = r_type
                rel.sense = [pred_sense]
                rid += 1

        assert len(nonexp_res) == rid, 'nonexp relations size not match'
Example #3
    def contruir_corpus_experimento(self):
        '''Builds the experiment dataset.'''
        c = Corpus()
        if self.tamanio == 'BI':
            busquedaInicial = leer_archivo(open(self.directorio + 'bi.csv', 'r'), eliminar_primero=True)
            clasificados = leer_archivo(open(self.directorio + 'clasificados.csv', 'r'), eliminar_primero=True)
        elif self.tamanio == 'Univ':
            busquedaInicial = leer_archivo(open(self.directorio + 'dataPapers.csv', 'r'), eliminar_primero=True)
            clasificados = leer_archivo(open(self.directorio + 'validacion.csv', 'r'), eliminar_primero=True)
        conjuntoA = leer_archivo(open(self.directorio + 'a.csv', 'r'), eliminar_primero=True)
        conjuntoS = leer_archivo(open(self.directorio + 's.csv', 'r'), eliminar_primero=True)
        conjuntoJ = leer_archivo(open(self.directorio + 'j.csv', 'r'), eliminar_primero=True)
        conjuntoO = leer_archivo(open(self.directorio + 'o.csv', 'r'), eliminar_primero=True)

        xmls = self.obtener_xmls()

        # Files with the EIDs of the papers that will make up the network
        ##archivo_papers_red = dividir_archivo_fecha(open(self.directorio+'relevantes.csv'), open(self.directorio+'relevantesFecha.csv'), 2013)
        archivo_papers_red = open(self.directorio + 'bi.csv')
        # List of EIDs of the papers that will make up the network
        lista_papers_red = leer_archivo(archivo_papers_red, eliminar_primero=True)
        # Author-paper pairs for the network
        dicci_contruir_red = obtener_autores(xmls, lista_papers_red)
        # Here all author-paper pairs of the corpus should be present
        dicci_todos_autores_papers = obtener_autores(xmls, leer_archivo(open(self.directorio + 'bi.csv'), eliminar_primero=True))
        #c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados,
        #                   conjuntos_red=dicci_contruir_red, diccionario_todos_autores=dicci_todos_autores_papers)
        c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados)
Example #4
def main():
    logging.basicConfig(format=DefaultSetting.FORMAT_LOG, level=logging.INFO)

    start_time = datetime.now()

    input_file = 'data/content.with.categories.seg.vni'
    stopwords_file = 'data/stopwords.txt'
    num_topics = 100
    prefix_name = 'demo'
    directory = 'tmp'
    query = 'data/query.txt'

    corpus = Corpus()
    corpus.build_corpus(input_file, stopwords_file, directory, prefix_name)
    LDA = LDAModel()
    LDA.train(corpus.corpus, corpus.dictionary, num_topics, directory, prefix_name)
    LDA.show()

    docsim = DocSim()
    docsim.set_model(LDA.model)
    docsim.set_doc(corpus)
    docsim.vectorized(num_topics)
    # docsim.save(directory, prefix_name)

    print 'Training time: ', datetime.now() - start_time

    start_time = datetime.now()
    reader = codecs.open(query, 'r', 'utf8')
    documents = []
    for line in reader.readlines():
        documents.append(line.replace('\n', ''))
    docsim.query(documents, True, directory, prefix_name)
    docsim.query(documents, False, directory, prefix_name)
    print 'Query time: ', datetime.now() - start_time
Example #5
 def _process_parsed_conn(self, articles, which='test'):
     """
     generate explicit relation for each true discourse connective
     """
     connParser = Connective()
     conn_feat_name = FILE_PATH + '/../tmp/conn.feat'
     conn_feat_file = codecs.open(conn_feat_name, 'w', 'utf-8')
     checked_conns = []
     for art in articles:
         checked_conns.append(connParser.print_features(art, which, conn_feat_file))
     conn_feat_file.close()
     conn_pred_name = FILE_PATH + '/../tmp/conn.pred'
     Corpus.test_with_opennlp(conn_feat_name, connParser.model_file, conn_pred_name)
     conn_res = [l.strip().split()[-1] for l in codecs.open(conn_pred_name, 'r', 'utf-8')]
     assert len(checked_conns) == len(articles), 'article size not match'
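     # the prediction file is flat, so slice out each article's candidate labels in order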
     s = 0
     for art, cand_conns in zip(articles, checked_conns):
         length = len(cand_conns)
         cand_res = conn_res[s:s+length]
         s += length
         for conn, label in zip(cand_conns, cand_res):
             if label == '1':
                 rel = Relation()
                 rel.doc_id = art.id
                 rel.rel_type = 'Explicit'
                 rel.article = art
                 rel.conn_leaves = conn
                 rel.conn_addr = [n.leaf_id for n in conn]
                 art.exp_relations.append(rel)
     assert s == len(conn_res), 'conn size not match'
Example #6
def preprocess(filename, stopword_filename=None, extra_stopwords=None):
    """
    Preprocesses a CSV file and returns ...

    Arguments:

    filename -- name of CSV file

    Keyword arguments:

    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    """

    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()

    for fields in reader(open(filename), delimiter=','):
        corpus.add(fields[0], tokenize(fields[-1], stopwords))

    corpus.freeze()

    return corpus
Example #7
    def output_json_format(self, parse_path, rel_path):
        preds = [it.strip().split()[-1] for it in open(self.predicted_file)]
        rel_dict = Corpus.read_relations(rel_path)
        idx = 0
        for art in Corpus.read_parses(parse_path, rel_dict):
            for rel in art.relations:
                if rel.rel_type == 'Explicit':
                    continue
                pred_sense = preds[idx]
                json_dict = {}
                json_dict['DocID'] = rel.doc_id
                if pred_sense == 'EntRel':
                    r_type = 'EntRel'
                elif pred_sense == 'NoRel':
                    r_type = 'NoRel'
                else:
                    r_type = 'Implicit'

                json_dict['Type'] = r_type
                json_dict['Sense'] = [pred_sense.replace('_', ' ')]
                json_dict['Connective'] = {}
                json_dict['Connective']['TokenList'] = []
                json_dict['Arg1'] = {}
                json_dict['Arg1']['TokenList'] = []
                json_dict['Arg2'] = {}
                json_dict['Arg2']['TokenList'] = []
                print json.dumps(json_dict)
                idx += 1
Example #8
def test_interro4():
    print('Testing interrogation 4')
    corp = Corpus('data/test-stripped-tokenised')
    data = corp.interrogate({'n': 'any'})
    d = {'and interrogating': {'first': 0, 'second': 2},
         'concordancing and': {'first': 0, 'second': 2}}
    assert_equals(data.results.to_dict(), d)
Example #9
 def prepare_data(self, parse_path, rel_path, which, to_file):
     rel_dict = Corpus.read_relations(rel_path)
     for art in Corpus.read_parses(parse_path, rel_dict):
         for rel in art.relations:
             if rel.rel_type != 'Explicit':
                 continue
             rel.article = art
             rel.get_conn_leaves()
         self.print_features(art, which, to_file)
Example #10
def generateData():
	rep = Representor(None, 'citybeat', 'next_week_candidate_event_25by25_merged')
	corpus = Corpus()
	corpus.buildCorpusOnDB('citybeat', 'next_week_candidate_event_25by25_merged')
	true_event_list, false_event_list = loadNextWeekData()
	EventFeatureTwitter(None).GenerateArffFileHeader()
		
	for event in true_event_list + false_event_list:
		EventFeatureTwitter(event, corpus, rep).printFeatures()
Example #11
def test_parse():
    import shutil
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except:
        pass
    parsed = unparsed.parse()
    assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
Example #12
 def test_bb_target_state_halfed(self):
     feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_cons_feature_table.json"))
     constraint_set = ConstraintSet.load(get_constraint_set_fixture("bb_target_constraint_set.json"),
                                         feature_table)
     target_lexicon_words = Corpus.load(get_corpus_fixture("bb_target_lexicon_halfed.txt")).get_words()
     lexicon = Lexicon(target_lexicon_words, feature_table)
     grammar = Grammar(feature_table, constraint_set, lexicon)
     corpus = Corpus.load(get_corpus_fixture("bb_corpus.txt"))
     traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
     self.assertEqual(traversable_hypothesis.get_energy(), 407430)
Example #13
def test_parse_speakseg(skipassert = False):
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except:
        pass
    parsed = unparsed.parse(speaker_segmentation = True)
    if not skipassert:
        assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
Example #14
    def _get_corpus(self):
        self.training_corpus = Corpus()
        self.training_corpus.load_from_file(self.training_corpus_f)

        self.unlabeled_corpus = Corpus()
        self.unlabeled_corpus.load_from_file(self.u_corpus_f)

        self.test_corpus = Corpus()
        self.test_corpus.load_from_file(self.test_corpus_f)

        self.user_corpus = Corpus()
Example #15
 def scoring(self, method='zagibolov'):
     # Supply arguments to Corpus to connect to the database: user, password and db.
     corpus = Corpus(password='', db='project_major')
     corpus.getTweets()
     dataset = corpus.dataSet
     preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
     scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
     j = 0
     for data in dataset:
         preprocess.preprocessScoring(data)
         processed = preprocess.processed_data
         
     for data in processed:
         scoring.count(data['tweet'])
 ##        print self.seeds
     preprocess.seeds = scoring.lexicon_count
     preprocess.processLexicon()
     scoring.lexicons = preprocess.lexicons
 ##        print scoring.lexicon_count
     last_score = {}
     i = 0
     for i in range(0,3):
         total = 0
         j = 0
         negative = 0
         positive = 0
         scoring.resetLexiconCount()
 ##        print self.lexicons
         for data in processed:
             if j == 50:
                 break
             j += 1
             score = scoring.score(data)
             if score != 0:
                 total += 1
                 if score < 0:
                     negative += 1
                 else:
                     positive += 1
         scoring.adjustScoring()
         if last_score == {}:
             last_score = scoring.lexicons
             this_score = last_score
         else:
             this_score = scoring.lexicons
             if this_score == last_score:
                 break
             else:
                 last_score = this_score
         print this_score
         print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
     print this_score
     print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
Example #16
 def test_corpusContainsOnlyEmails(self):
     """Test reading the corpus with email messages only."""
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for fname, contents in corpus.emails():
         observed[fname] = contents
     # Verify the results
     self.assertEqual(len(self.expected), len(observed),
                      'The emails() method did not generate all the corpus files.')
     self.assertEqual(self.expected, observed,
                          'The read file contents are not equal to the expected contents.')
Example #17
 def __init__(self, path, dirname):
     import os
     from os.path import join, isfile, isdir
     self.path = join(dirname, path)
     kwargs = {'print_info': False, 'level': 'f'}
     Corpus.__init__(self, self.path, **kwargs)
     if self.path.endswith('.p'):
         self.datatype = 'tokens'
     elif self.path.endswith('.xml'):
         self.datatype = 'parse'
     else:
         self.datatype = 'plaintext'
Example #18
 def test_corpusContainsOnlyEmails(self):
     """Test reading the corpus with email messages only."""
     corpus = Corpus(CORPUSDIR)
     # Exercise the SUT
     nitems = 0
     for fname, contents in corpus.emails_as_string():
         nitems += 1
         # Validate the results
         self.assertEqual(self.expected[fname], contents,
                          'The read file contents are not equal to the expected contents.')
     self.assertEqual(nitems, NEMAILS,
                      'The emails_as_string() method did not return the right number of files.')
Example #19
 def test_aspiration_and_lengthening_extended_augmented_target_state(self):
     configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 100
     configurations["RESTRICTION_ON_ALPHABET"] = True
     feature_table = FeatureTable.load(
         get_feature_table_fixture("aspiration_and_lengthening_extended_augmented_feature_table.json"))
     constraint_set = ConstraintSet.load(
         get_constraint_set_fixture("aspiration_and_lengthening_augmented_target_constraint_set.json"),
         feature_table)
     target_lexicon_words = Corpus.load(get_corpus_fixture("aspiration_and_lengthening_extended_target_lexicon.txt")).get_words()
     lexicon = Lexicon(target_lexicon_words, feature_table)
     grammar = Grammar(feature_table, constraint_set, lexicon)
     corpus = Corpus.load(get_corpus_fixture("aspiration_and_lengthening_extended_corpus.txt"))
     traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
Example #20
    def test(self, test_corpus_dir):
        test_corpus = Corpus(test_corpus_dir)
        with open(os.path.join(test_corpus_dir, "!prediction.txt"), "w+") as a_file:
            for filename, body in test_corpus.emails():
                if self.bayesian_combination(body) > 0.9 or self.get_email_adress(body) in self.black_list:
                    decision = "SPAM"

                else:
                    if self.get_email_adress(body) in self.white_list:
                        decision = "OK"
                    else:
                        decision = "OK"
                a_file.write(filename + " " + decision + "\n")
Example #21
    def __init__(self, path, dirname, datatype):
        import os
        from os.path import join, isfile, isdir

        self.path = join(dirname, path)
        kwargs = {"print_info": False, "level": "f", "datatype": datatype}
        Corpus.__init__(self, self.path, **kwargs)
        if self.path.endswith(".p"):
            self.datatype = "tokens"
        elif self.path.endswith(".xml"):
            self.datatype = "parse"
        else:
            self.datatype = "plaintext"
Example #22
 def _process_parsed_argpos(self, articles, which='test'):
     argpos_feat_name = FILE_PATH + '/../tmp/argpos.feat'
     argpos_feat_file = codecs.open(argpos_feat_name, 'w', 'utf-8')
     argpos_checked = []
     argposParser = ArgPos()
     for art in articles:
         for rel in art.exp_relations:
             argpos_checked.append(argposParser.print_features(rel, which, argpos_feat_file))
     argpos_feat_file.close()
     argpos_pred_name = FILE_PATH + '/../tmp/argpos.pred'
     Corpus.test_with_opennlp(argpos_feat_name, argposParser.model_file, argpos_pred_name)
     argpos_res = [l.strip().split()[-1] for l in codecs.open(argpos_pred_name, 'r', 'utf-8')]
     return argpos_res
Example #23
 def test_load_and_save(self):
     """Load and save functions must be inverses."""
     filename = 'testing_file'
     self.co.save_to_file(filename)
     new_co = Corpus()
     new_co.load_from_file(filename)
     self.assertTrue(_eq_crs_matrix(new_co.instances, self.co.instances))
     for index in range(len(self.co)):
         self.assertEqual(self.co.full_targets[index],
                          new_co.full_targets[index])
         self.assertEqual(self.co.representations[index],
                          new_co.representations[index])
     self.assertIsNotNone(new_co.primary_targets)
Example #24
def run_simulation(configurations_tuples, simulation_number, log_file_template, feature_table_file_name, corpus_file_name, constraint_set_file_name,
                  sample_target_lexicon=None, sample_target_outputs=None, target_lexicon_indicator_function=None,
                  target_constraint_set_file_name=None, target_lexicon_file_name=None, convert_corpus_word_to_target_word_function=None,
                  initial_lexicon_file_name=None):

    for configurations_tuple in configurations_tuples:
        configurations[configurations_tuple[0]] = configurations_tuple[1]

    log_file_name = log_file_template.format(platform.node(), simulation_number)
    dirname, filename = split(abspath(__file__))
    log_file_path = join(dirname, "../logging/", log_file_name)

    # if os.path.exists(log_file_path):
    #     raise ValueError("log name already exits")

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    file_log_formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', "%Y-%m-%d %H:%M:%S")
    file_log_handler = logging.FileHandler(log_file_path, mode='w')
    file_log_handler.setFormatter(file_log_formatter)
    logger.addHandler(file_log_handler)

    feature_table = FeatureTable.load(get_feature_table_fixture(feature_table_file_name))
    corpus = Corpus.load(get_corpus_fixture(corpus_file_name))
    constraint_set = ConstraintSet.load(get_constraint_set_fixture(constraint_set_file_name),
                                              feature_table)

    if initial_lexicon_file_name:
        corpus_for_lexicon = Corpus.load(get_corpus_fixture(initial_lexicon_file_name))
        lexicon = Lexicon(corpus_for_lexicon.get_words(), feature_table)
    else:
        lexicon = Lexicon(corpus.get_words(), feature_table)
    grammar = Grammar(feature_table, constraint_set, lexicon)
    data = corpus.get_words()
    traversable_hypothesis = TraversableGrammarHypothesis(grammar, data)

    keyargs_dict = {}
    
    if sample_target_lexicon and sample_target_outputs and target_lexicon_indicator_function:
        keyargs_dict["sample_target_lexicon"] = sample_target_lexicon
        keyargs_dict["sample_target_outputs"] = sample_target_outputs
        keyargs_dict["target_lexicon_indicator_function"] = target_lexicon_indicator_function

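    # when a target grammar is supplied, compute its energy as a reference value for the run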
    if target_constraint_set_file_name and (target_lexicon_file_name or convert_corpus_word_to_target_word_function):
        target_energy = get_target_hypothesis_energy(feature_table, target_constraint_set_file_name, corpus,
                                                     target_lexicon_file_name, convert_corpus_word_to_target_word_function)
        keyargs_dict["target_energy"] = target_energy

    simulated_annealing = SimulatedAnnealing(traversable_hypothesis, **keyargs_dict)
    simulated_annealing.run()
Example #25
def test_parse():
    import shutil
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except:
        pass
    parsed = unparsed.parse()
    fnames = []
    for subc in parsed.subcorpora:
        for f in subc.files:
            fnames.append(f.name)
    assert_equals(fnames, ['intro.txt.xml', 'body.txt.xml'])
Example #26
    def prepare_data(self, parse_path, rel_path, which, to_file):
        rel_dict = Corpus.read_relations(rel_path)
        for art in Corpus.read_parses(parse_path, rel_dict):
            for rel in art.relations:
                if rel.rel_type != 'Explicit':
                    continue
                rel.article = art
                rel.get_conn_leaves()
                labels = {s.replace(' ','_') for s in rel.sense}
                labels = {s for s in labels if s in SENSES}
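                # at test time, collapse all gold senses into a single '|'-joined label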
                if which == 'test':
                    labels = ['|'.join(labels)]

                self.print_features(rel, labels, which, to_file)
Example #27
 def test_t_aspiration_target_state(self):
     configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
     feature_table = FeatureTable.load(get_feature_table_fixture("t_aspiration_feature_table.json"))
     constraint_set = ConstraintSet.load(get_constraint_set_fixture("t_aspiration_target_constraint_set.json"),
                                         feature_table)
     target_lexicon_words = Corpus.load(get_corpus_fixture("t_aspiration_target_lexicon.txt")).get_words()
     lexicon = Lexicon(target_lexicon_words, feature_table)
     grammar = Grammar(feature_table, constraint_set, lexicon)
     corpus = Corpus.load(get_corpus_fixture("t_aspiration_corpus.txt"))
     traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
     configurations["RESTRICTION_ON_ALPHABET"] = True
     self.assertEqual(traversable_hypothesis.get_energy(), 167838)
     configurations["RESTRICTION_ON_ALPHABET"] = False
     self.assertEqual(traversable_hypothesis.get_energy(), 173676)
Example #28
def test_parse_speakseg(skipassert = False):
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except:
        pass
    parsed = unparsed.parse(speaker_segmentation = True)
    fnames = []
    for subc in parsed.subcorpora:
        for f in subc.files:
            fnames.append(f.name)
    assert_equals(fnames, ['intro.txt.xml', 'body.txt.xml'])
Example #29
def process_projects(src_directory, glossary_description, glossary_file):
    corpus = Corpus(src_directory)
    corpus.process()

    reference_sources = ReferenceSources()
    reference_sources.read_sources()

    metrics = Metrics()
    metrics.create(corpus)

    # Select terms
    MAX_TERMS = 5000
    sorted_terms_by_tfxdf = sorted(metrics.tfxdf, key=metrics.tfxdf.get,
                                   reverse=True)

    # Developer report
    glossary_entries = OrderedDict()
    translations = Translations()
    selected_terms = sorted_terms_by_tfxdf[:MAX_TERMS] # Sorted by frequency

    for term in selected_terms:
        glossary_entries[term] = translations.create_for_word_sorted_by_frequency(corpus.documents, term, reference_sources)

    dev_glossary_serializer = DevGlossarySerializer()
    dev_glossary_serializer.create(u"dev-" + glossary_file + ".html",
                                   glossary_description, corpus,
                                   glossary_entries, reference_sources)

    # User report
    glossary_entries = []
    selected_terms = sorted(sorted_terms_by_tfxdf[:MAX_TERMS])  # Sorted by term

    glossary = Glossary(glossary_description)
    for term in selected_terms:
        glossary_entry = GlossaryEntry(
            term,
            translations.create_for_word_sorted_by_frequency(corpus.documents,
                                                             term,
                                                             reference_sources)
        )
        glossary.entries.append(glossary_entry)

    glossary_entries = glossary.get_dict()
    process_template('templates/userglossary-html.mustache',
                     glossary_file + ".html", glossary_entries)
    process_template('templates/userglossary-csv.mustache',
                     glossary_file + ".csv", glossary_entries)

    generate_database(glossary, glossary_file)
Example #30
 def test_corpusContainsAlsoSpecialFiles(self):
     """Test reading the corpus with special files."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(
         fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)     
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     for fname, contents in corpus.emails():
         observed[fname] = contents
     # Verify the results
     self.assertEqual(len(self.expected), len(observed),
                      'The emails() method did not generate all the corpus files.')
     self.assertEqual(self.expected, observed,
                          'The read file contents are not equal to the expected contents.')
Example #31
 def __init__(self, dictionary=None, corpus=None, index_file=None, max_docs=None, **kwargs):
     Corpus.__init__(self, dictionary=dictionary, corpus=corpus)
     self.clip_corpus(max_docs)
     # Set up for KNN
     features = len(self.dictionary)
     self.index = AnnoyIndex(features)
     start_time = datetime.datetime.now()
     if not index_file:
         self.transform_corpus(models.TfidfModel)
         for i, vector in enumerate(self):
             self.index.add_item(i, list(sparse2full(vector, features).astype(float)))
         self.index.build(self.no_trees)
     else:
         self.index.load(index_file)
     end_time = datetime.datetime.now()
     self.train_time = end_time - start_time
     return
Example #32
 def _get_indices(self, sentence):
     word_list = list(self._words)
     indices = []
     for token in Corpus.tokenize(sentence):
         if token in self._words:
             index = word_list.index(token)
             indices.append(index)
     return indices
Example #33
 def build(cls, files):
     corpora = []
     for file in files:
         ext = os.path.splitext(file)[1]
         corpus = Corpus(open(file, 'rb'), cls.ext_to_sentiment[ext])
         corpora.append(corpus)
     corpus_set = CorpusSet(corpora)
     return SentimentClassifier(corpus_set)
Example #34
def main():

    pred_with_tweets = '../data/trial.csv'  # predicted labels + tweet text
    gold = '../data/trial.labels'  # file contains gold labels
    mycorpus = Corpus(pred_with_tweets, gold)
    myscores = Scorer(mycorpus)
    myresult = Result()
    myresult.show(myscores)
Example #35
def process_corpus(language, source, filelist, corpus_location, verbose):
    """Create a corpus at corpus_location and run the default pipeline over it."""
    pipeline = config.DEFAULT_PIPELINE
    if language == 'cn':
        pipeline = config.DEFAULT_PIPELINE_CN
    pipeline_file = config.DEFAULT_PIPELINE_CONFIGURATION_FILE
    corpus = Corpus(language=language,
                    datasource=source,
                    source_file=filelist,
                    corpus_path=corpus_location,
                    pipeline_config=pipeline)
    rconfig = RuntimeConfig(corpus_location,
                            language,
                            source,
                            pipeline_file,
                            verbose=verbose)
    corpus.run_default_pipeline(rconfig)
Example #36
def main():

    QAfile = sys.argv[1]
    ReviewFile = sys.argv[2]
    minReview = int(sys.argv[3])
    V = int(sys.argv[4])
    k = int(sys.argv[5])
    numiter = int(sys.argv[6])
    Lambda = float(sys.argv[7])
    predictionsOut = sys.argv[8]
    rankingOut = sys.argv[9]
    create_corpus = sys.argv[10]  # takes zero and 1 as args
    corpus_pickle_file = "./Data/corpus_{}.pkl".format(
        QAfile.split("/")[-1].split(".")[0])

    if int(create_corpus):  # rebuild the corpus and cache it when the flag is 1
        corpus = Corpus(QAfile, ReviewFile, minReview, V)
        corpus.construct_QAnswersAndQPerItem()
        corpus.construct_SentencesAndSPerItem()
        corpus.Calculate_PairWiseFeature()
        with open(corpus_pickle_file, 'wb') as f:
            pickle.dump(corpus, f)
    else:
        with open(corpus_pickle_file, 'rb') as f:
            corpus = pickle.load(f)

    print("corpus is available")
    print(("Vocabulary Size: " + str(corpus.Map.V)))
    print(("Number of Questions: " + str(len(corpus.QAnswers))))
    print(("Number of Reviews: " + str(len(corpus.Sentences))))
    print(("Number of Items " + str(len(corpus.Map.ItemIDMap))))
    print(("Avg review length " +
           str(sum(corpus.Avgdl.values()) / len(corpus.Avgdl))))
Example #37
 def train(self,
           file_path,
           batch_size=10,
           learning_rate=0.1,
           lr_decay=0.05,
           epochs=1000,
           momentum=0.0):
     corpus = Corpus(file_path)
     truth_dict = utils.read_classification_from_file(file_path +
                                                      "/!truth.txt")
     got_data = True
     mails_getter = corpus.emails()
     batches = []
     # loads all data from directory in batches of given size
     while got_data:
         batch = []
         # loads a batch of given size, a smaller one if out of data
         for i in range(batch_size):
             try:
                 email = next(mails_getter)
                 batch.append(
                     (email[1],
                      1 if truth_dict[email[0]] == self.pos_tag else 0))
             except StopIteration:
                 got_data = False
                 break
         batches.append(batch)
     for e in range(epochs):  # trains multiple times on all batches
         self.init_momentums()
          for batch in batches:  # performs gradient descent on each batch
             # gets feature vectors for batch
             feature_vectors = [
                 (m[0].get_feature_vector_plr()) for m in batch
             ]  # gets feature vectors of the batch
             y = [m[1] for m in batch]  # gets the truth vector of the batch
             for i in range(
                     self.subvector_count
             ):  # weights for each subvector are trained separately
                 subvector_batch = [
                     v[i] for v in feature_vectors
                 ]  # isolates a subvector from all vectors
                 self.gradient_descent(i, y, subvector_batch, learning_rate,
                                       momentum)
             print(f"trained on epoch #{e +1}")
         learning_rate *= 1 / (1 + lr_decay * e)
Example #38
 def _process_parsed_argpos(self, articles, which='test'):
     argpos_feat_name = FILE_PATH + '/../tmp/argpos.feat'
     argpos_feat_file = codecs.open(argpos_feat_name, 'w', 'utf-8')
     argpos_checked = []
     argposParser = ArgPos()
     for art in articles:
         for rel in art.exp_relations:
             argpos_checked.append(
                 argposParser.print_features(rel, which, argpos_feat_file))
     argpos_feat_file.close()
     argpos_pred_name = FILE_PATH + '/../tmp/argpos.pred'
     Corpus.test_with_opennlp(argpos_feat_name, argposParser.model_file,
                              argpos_pred_name)
     argpos_res = [
         l.strip().split()[-1]
         for l in codecs.open(argpos_pred_name, 'r', 'utf-8')
     ]
     return argpos_res
Example #39
def main():
    nnlm = NNLM()
    corpus = Corpus.read_corpus(sys.argv[1])
    corpus.filter_freq(10000)
    nnlm.add_corpus(corpus)
    nnlm.create_model()
    nnlm.create_algorithm()
    nnlm.create_training_problem(sys.argv[2])
    nnlm.trainer.main_loop()
Example #40
 def __init__(self, frontier):
     self.frontier = frontier
     self.corpus = Corpus()
     
     self.subdomains = defaultdict(int)
     self.most_valid_links = (None,-1)
     self.downloaded_urls = set()
     self.traps = set()
     
     # list of all links ever added to the frontier, w/o scheme
     self.front = []
     # set of links that have already been parsed
     self.dup = set()
     
     # for comparing url similarities
     self.compare_url = None
     self.similar_url_count = 0
     self.compare_traps = set()
Example #41
def main():
    parser = argparse.ArgumentParser('LM main')
    parser.add_argument('--corpus',
                        type=str,
                        default='tiny_corpus.txt',
                        help='corpus to train')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--drop', type=float, default=0.1)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--momentum', type=float, default=.99)
    parser.add_argument('--clip_norm', type=float, default=5)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--save', type=str, default='model.pt')
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--arpa', type=str, default='tiny_corpus.arpa')
    args = parser.parse_args()

    corpus = Corpus(args.corpus)
    loader = CorpusLoader(corpus, args.batch_size, True, args.num_workers)
    if args.load is None:
        extractor = EmbeddingExtractor(corpus.vocab, args.emb_dim, args.device)
        network = Network(extractor, args.num_layers,
                          drop=args.drop).to(args.device)
    else:
        network = torch.load(args.load, map_location=args.device)
        network.extractor.device = args.device
        network.rnn.flatten_parameters()

    ken_lm = kenlm.LanguageModel(args.arpa)
    optimizer = torch.optim.SGD(network.parameters(), args.lr, args.momentum)
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=10,
                                                           verbose=True,
                                                           factor=.5)

    min_loss = float('inf')
    for epoch in range(args.epochs):
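        # sample a sentence from the current model and score its perplexity with the KenLM reference model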
        pred = generate(network, device=args.device)
        gen_sentence = ' '.join(pred)
        ppl = ken_lm.perplexity(gen_sentence)
        print('%s\nPPL:\t%f' % (gen_sentence, ppl))

        loss = single_epoch(network, loader, optimizer, loss_fn,
                            args.clip_norm)
        print('epochs %d \t loss %.3f' % (epoch, loss))
        scheduler.step(loss)

        if min_loss > loss:
            min_loss = loss
            print('saving to %s' % args.save)
            torch.save(network, args.save)
        print()
Example #42
 def channels_from_paths(self, paths, channels=[]):
     for path in paths:
         name = path.split('/')[1:]
         if not self.name_in_channels(name, channels):
             source_text = file(path).read()
             corpus = Corpus(source_text, name)
             channels.append(
                 Channel(self, self.textbox, corpus, len(channels)))
     return channels
Example #43
def get_answer_candidates(corpus_dict: TYPE_CORPUS_DICT):
    ac = AnswerCandidate()
    for name, path in corpus_dict.items():
        corpus = ac.add_candidates_to_corpus(Corpus(path, name))
        corpus.save('data/candidate_answer_{}.pkl'.format(name))
        corpus_dict[name] = corpus
    ac.cache_similarity_query()
    # ac.cache_relation_paths()
    return corpus_dict
Example #44
def analysis(corpus):
    dictionary = Dictionary()
    dictionary.load_dictionary()
    pairs = dictionary.get_pairs()

    diacritics, no_diacritics = Corpus().get_dictionaries_frequencies_and_sentences(corpus, pairs)
    update_pairs(pairs, diacritics, no_diacritics)

    diacritics_corpus = 0
    position = 0
    not_found_in_corpus_pos = {}
    with open('diacritics.csv', 'w') as writer:
        msg = f"diacritic_word\tdiacritic_pos\tdiacritic_freq\t"
        msg += f"no_diacritic_word\tno_diacritic_pos\tno_diacritic_freq\t"
        msg += f"total_freq\tcnt\n"

        writer.write(msg)
        for pair in pairs.values():
            diacritic = pair.diacritic
            no_diacritic = pair.no_diacritic

            if diacritic.frequency == 0 and no_diacritic.frequency == 0:
                logging.debug(f"Frequency 0: {diacritic.word}, {diacritic.pos}")
                pos = diacritic.pos
                if pos in not_found_in_corpus_pos:
                    counter = not_found_in_corpus_pos[pos]
                else:
                    counter = 0

                counter = counter + 1
                not_found_in_corpus_pos[pos] = counter
                continue

            total_freq = diacritic.frequency + no_diacritic.frequency

            msg = f"{diacritic.word}\t{diacritic.pos}\t{diacritic.frequency}\t"
            msg += f"{no_diacritic.word}\t{no_diacritic.pos}\t{no_diacritic.frequency}\t{total_freq}\t{position}\n"
            diacritics_corpus = diacritics_corpus + 1
            position = position + 1
            writer.write(msg)

    diacritics_dict = len(pairs)
    pdiacritics_dict = diacritics_dict * 100 / len(dictionary.words)
    pdiacritics_corpus = diacritics_corpus * 100 / diacritics_dict
    
    logging.info(f"Total unique words in dictionary: {len(dictionary.words)}")
    logging.info(f"Words with diacritic/no diacritic version {diacritics_dict} ({pdiacritics_dict:.2f}%) (in dictionary)")
    logging.info(f"Words with diacritic/no diacritic {diacritics_corpus} ({pdiacritics_corpus:.2f}%) (in corpus)")


    len_pos = total = sum(int(v) for v in not_found_in_corpus_pos.values())
    for pos in not_found_in_corpus_pos:
        counter = not_found_in_corpus_pos[pos]
        pcounter = counter * 100 / len_pos
        logging.info(f"Not found in corpus, grammar category {pos} - number: {counter} ({pcounter:.2f}%)")

    return pairs
Example #45
    def prepare_data(self, parse_path, rel_path, which, to_file):
        rel_dict = Corpus.read_relations(rel_path)
        articles = []
        dist = defaultdict(int)
        for art in Corpus.read_parses(parse_path, rel_dict):
            articles.append(art)
            for rel in art.relations:
                rel.article = art
                rel.get_arg_leaves()
                if rel.rel_type == 'Explicit':
                    continue
                labels = {s.replace(' ', '_') for s in rel.sense}
                for l in labels:
                    dist[l] += 1
                if which == 'test':
                    labels = ['|'.join(labels)]

                self.print_features(rel, labels, to_file)

        # add NoRel relations
        for art in articles:
            for s1, s2 in zip(art.sentences[:-1], art.sentences[1:]):
                if not art.has_inter_relation(s1.id):
                    rel = Relation()
                    rel.article = art
                    rel.doc_id = art.id
                    rel.arg1s['parsed'] = [s1.tree.root
                                           ] if not s1.tree.is_null() else []
                    rel.arg1_leaves = self.remove_leading_tailing_punc(
                        s1.leaves)
                    rel.arg1_addr = [n.leaf_id for n in rel.arg1_leaves]
                    rel.arg1_sid = rel.arg1_leaves[-1].goto_tree(
                    ).sent_id if len(rel.arg1_leaves) > 0 else -1
                    rel.arg1_text = ' '.join(n.value for n in rel.arg1_leaves)

                    rel.arg2s['parsed'] = [s2.tree.root
                                           ] if not s2.tree.is_null() else []
                    rel.arg2_leaves = self.remove_leading_tailing_punc(
                        s2.leaves)
                    rel.arg2_addr = [n.leaf_id for n in rel.arg2_leaves]
                    rel.arg2_sid = rel.arg2_leaves[0].goto_tree(
                    ).sent_id if len(rel.arg2_leaves) > 0 else -1
                    rel.arg2_text = ' '.join(n.value for n in rel.arg2_leaves)
                    self.print_features(rel, ['NoRel'], to_file)
Example #46
 def test_corpusContainsAlsoSpecialFiles(self):
     """Test reading the corpus with special files."""
     # Add a special file into the corpus dir
     save_file_to_corpus_dir(fname=SPECIAL_FILENAME,
                             contents='fake',
                             dirname=CORPUS_DIR)
     corpus = Corpus(CORPUS_DIR)
     # Exercise the SUT
     observed = {}
     with replaced_open():
         for fname, contents in corpus.emails():
             observed[fname] = contents
     # Verify the results
     self.assertEqual(
         len(self.expected), len(observed),
         'The emails() method did not generate all the corpus files.')
     self.assertEqual(
         self.expected, observed,
         'The read file contents are not equal to the expected contents.')
Example #47
 def genSpecVec(self, origin_line):
     '''Calculate spec_vec for a line.
     @param
         origin_line: an origin line fetched from database, like ("asngy033", "zzy", "bhuv", ...).
     @return
         The spec_vec for the line, in the format like [0.3, 0.1, -3.2, ...].
         The dim is determined by Word2Vec's model.
     '''
     line = Corpus.addFieldForSingle(origin_line)
     return self.int_genSpecVec(line)
Example #48
 def test_get_doc_topic_sims(self):
     actual_output = Cp.get_doc_topic_sims(doc_embeddings, topic_embeddings)
     desired_output = {  # {doc-id: {topic/cluster-label: similarity}}
         1: np.array([0.9486832980505138, 0.9486832980505138]),
         10: np.array([0.9761870601839528, 0.6507913734559685]),
         3: np.array([0.6507913734559685, 0.9761870601839528])
     }
     actual_output = {k: list(v) for k, v in actual_output.items()}
     desired_output = {k: list(v) for k, v in desired_output.items()}
     self.assertEqual(actual_output, desired_output)
Example #49
    def parse(self,
              corenlppath=False,
              operations=False,
              copula_head=True,
              speaker_segmentation=False,
              memory_mb=False,
              *args,
              **kwargs):
        """
        Parse an unparsed corpus, saving to disk

        :param corenlppath: folder containing corenlp jar files
        :type corenlppath: str
                
        :param operations: which kinds of annotations to do
        :type operations: str
        
        :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for parser
        :type memory_mb: int

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :Example:

        >>> parsed = corpus.parse(speaker_segmentation = True)
        >>> parsed
        <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>


        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from make import make_corpus
        from corpus import Corpus
        #from process import determine_datatype
        #dtype, singlefile = determine_datatype(self.path)
        if self.datatype != 'plaintext':
            raise ValueError(
                'parse method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(
            make_corpus(self.path,
                        parse=True,
                        tokenise=False,
                        corenlppath=corenlppath,
                        operations=operations,
                        copula_head=copula_head,
                        speaker_segmentation=speaker_segmentation,
                        memory_mb=memory_mb,
                        *args,
                        **kwargs))
Example #50
    def train(self, file_path):
        self.content_spam_dict = {}
        self.content_ham_dict = {}
        class_dict = utils.read_classification_from_file(file_path +
                                                         '/!truth.txt')
        corpus = Corpus(file_path)
        email_generator = corpus.emails()
        content_counter_spam = Counter()
        content_counter_ham = Counter()
        content_wordcount_spam = 0
        content_wordcount_ham = 0

        spam_count = 0
        ham_count = 0
        every_word_content = set()

        for mail in email_generator:
            content_words = self.string_to_words(mail[1].content_no_html)
            content_counter = Counter(content_words)
            for word in content_words:
                every_word_content.add(word)

            if class_dict[mail[0]] == self.pos_tag:
                spam_count += 1
                content_counter_spam += content_counter
                content_wordcount_spam += len(content_words)
            else:
                ham_count += 1
                content_counter_ham += content_counter
                content_wordcount_ham += len(content_words)

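        # add-one (Laplace) smoothing: every observed word gets at least one count in both classes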
        for word in every_word_content:
            content_counter_ham[word] += 1
            content_counter_spam[word] += 1
            self.content_spam_dict[
                word] = content_counter_spam[word] / content_wordcount_spam
            self.content_ham_dict[
                word] = content_counter_ham[word] / content_wordcount_ham

        self.spam_probability = spam_count / (spam_count + ham_count)
        self.ham_probability = ham_count / (spam_count + ham_count)
        self.trained = True
Example #51
    def _process_exp_sense(self, articles, which='test'):
        exp_feat_name = FILE_PATH + '/../tmp/exp.feat'
        expParser = Explicit()
        exp_sense_file = codecs.open(exp_feat_name, 'w', 'utf-8')
        for art in articles:
            for rel in art.exp_relations:
                expParser.print_features(rel, ['xxxxx'], which, exp_sense_file)
        exp_sense_file.close()
        exp_pred = FILE_PATH + '/../tmp/exp.pred'
        Corpus.test_with_opennlp(exp_feat_name, expParser.model_file, exp_pred)

        exp_res = [
            l.strip().split()[-1] for l in codecs.open(exp_pred, 'r', 'utf-8')
        ]
        rid = 0
        for art in articles:
            for rel in art.exp_relations:
                pred_sense = exp_res[rid]
                rel.sense = [pred_sense]
                rid += 1
Example #52
def generateData2(_182, sparse=False):
    #	if sparse:
    rep = Representor()
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'candidate_event_25by25_merged')

    true_event_list, false_event_list = loadUnbalancedData(_182)

    if sparse:
        word_index, word_list = getCorpusWordList(
            rep, true_event_list + false_event_list)
        EventFeatureSparse(None).GenerateArffFileHeader(word_list)
    else:
        EventFeatureTwitter(None).GenerateArffFileHeader()

    for event in true_event_list + false_event_list:
        if not sparse:
            EventFeatureTwitter(event, corpus, rep).printFeatures()
        else:
            EventFeatureSparse(event, corpus, rep).printFeatures(word_index)
Example #53
    def _process_exp_sense(self, articles, which='test'):
        exp_feat_name = FILE_PATH + '/../tmp/exp.feat'
        expParser = Explicit()
        exp_sense_file = open(exp_feat_name, 'w')
        for art in articles:
            for rel in art.exp_relations:
                expParser.print_features(rel, ['Conjunction'], which, exp_sense_file)
        exp_sense_file.close()
        exp_vec = FILE_PATH + '/../tmp/exp.vec'
        exp_pred = FILE_PATH + '/../tmp/exp.pred'
        # Corpus.test_with_svm(exp_feat_name, expParser.feat_map_file, exp_vec, expParser.model_file, exp_pred)
        Corpus.test_with_opennlp(exp_feat_name, expParser.model_file, exp_pred)

        exp_res = [LABEL_SENSES_MAP[l.strip().split()[-1]] for l in open(exp_pred, 'r')]
        rid = 0
        for art in articles:
            for rel in art.exp_relations:
                pred_sense = exp_res[rid]
                rel.sense = [pred_sense]
                rid += 1
Example #54
def read_data(e_path, f_path):
    """
    Combine two halves of a parallel corpus into one.

    :param e_path: path to language 1 file.
    :param f_path: path to language 2 file.
    :return: a list of tuples with parallel sentences. Sentences consist of
        a list of tokens.
    """
    fe = open(e_path, "r").read()
    ff = open(f_path, "r").read()

    corpus = Corpus()

    for e, f in zip(fe.split(" \n"), ff.split(" \n")):
        e = e.split(' ')
        f = f.split(' ')
        corpus.add_foreign(set(f))
        corpus.corpus.append((Sentence(e, pad=True), Sentence(f)))
    return corpus
Example #55
def main():
    data = Corpus()
    dotted_path = data.data_directory
    ends = "ji.json"  # 控制对哪些后缀的数据进行处理,可选后缀:ji.json, corpus.json
    files = data.list_corpus_files(dotted_path, ends)
    for f in files:
        print(f)

    corpus = data.load_corpus(dotted_path, ends)

    pattern_answer = []

    for topic in corpus:
        for dialog in topic:
            if len(dialog) == 2:
                pattern = sent_2_pattern(dialog[0])
                pattern_answer.append([pattern, dialog[1]])

    topic_script = generate_script(pattern_answer)
    save_2_file(topic_script, topic_profile)
Example #56
class TestCorpusSet(unittest.TestCase):
    def setUp(self):
        self.negative = StringIO(u'I hated that so much')
        self.negative_corpus = Corpus(self.negative, 'negative')
        self.positive = StringIO(u'loved movie!! loved')
        self.positive_corpus = Corpus(self.positive, 'positive')

    def test_trivial(self):
        """
        consumes multiple files and turns it into sparse vectors
        """
        self.assertEqual('negative', self.negative_corpus.sentiment)

    def test_tokenize1(self):
        """
        downcases all the word tokens
        """
        self.assertListEqual(['quick', 'brown', 'fox'], Corpus.tokenize('Quick Brown Fox'))

    def test_tokenize2(self):
        """
        ignores all stop symbols
        """
        self.assertListEqual(['hello'], Corpus.tokenize('"\'hello!?!?!.\'"  '))

    def test_tokenize3(self):
        """
        ignores the unicode space
        """
        self.assertListEqual(['hello', 'bob'], Corpus.tokenize(u'hello\u00A0bob'))

    def test_positive(self):
        """
        consumes a positive training set
        """
        self.assertEqual('positive', self.positive_corpus.sentiment)

    def test_words(self):
        """
        consumes a positive training set and unique set of words
        """
        self.assertEqual({'loved', 'movie'}, self.positive_corpus.get_words())

    def test_sentiment_code_1(self):
        """
        defines a sentiment_code of 1 for positive
        """
        self.assertEqual(1, Corpus(StringIO(u''), 'positive').sentiment_code)

    def test_sentiment_code_minus1(self):
        """
        defines a sentiment_code of -1 for negative
        """
        self.assertEqual(-1, Corpus(StringIO(u''), 'negative').sentiment_code)
Example #57
 def test_get_relevant_docs(self):
     clus_terms = {  # {cluster-label: {A set of term-ids}}
         0: {2, 4},
         1: {1, 3}
     }
     actual_output = Cp.get_relevant_docs(clus_terms, df)
     desired_output = {  # {cluster-label: {A set of doc-ids}}
         0: {6, 7, 9},
         1: {4, 8, 9}
     }
     self.assertEqual(actual_output, desired_output)
Example #58
def run(config, output_dir, num_rep=5, valid_split=0.2, patience=0):
    use_cuda = torch.cuda.is_available()

    mean = 0.0  ## barbara
    vocab_file = 'data/twitter_hashtag/1kthashtag.vocab'
    dataset_file = 'data/twitter_hashtag/multiple.txt'
    emb = load_glove_embedding('data/twitter_hashtag/1kthashtag.glove')

    criterion = nn.CrossEntropyLoss()

    corpus = TwitterHashtagCorpus(train_file=dataset_file,
                                  vocab_file=vocab_file)
    config.vocab_size = corpus.vocab_size
    train_corpus = Corpus()
    train_corpus.x_data = corpus.x_train[:1000]
    train_corpus.y_data = corpus.y_train[:1000]
    valid_corpus = Corpus()
    valid_corpus.x_data = corpus.x_validation[:1000]
    valid_corpus.y_data = corpus.y_validation[:1000]

    metrics = {'accuracy': skmetrics.accuracy_score}

    for rep in range(1, num_rep + 1):
        model = TextCNN(config=config, pre_trained_emb=emb)
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

        #train_corpus, valid_corpus = corpus.split(valid_split=valid_split)

        output_dir_rep = os.path.join(output_dir, "rep{}".format(rep))

        t = Trainer(train_corpus=train_corpus,
                    valid_corpus=valid_corpus,
                    test_corpus=None,
                    model=model,
                    config=config,
                    criterion=criterion,
                    optimizer=optimizer,
                    verbose=False,
                    output_dir=output_dir_rep,
                    train_metrics=metrics,
                    val_metrics=metrics,
                    selection_metric='accuracy',
                    use_cuda=use_cuda)
        res = t.train(tqdm_prefix="Rep {}/{}".format(rep, num_rep),
                      patience=patience,
                      init_res_dict={"rep": rep})

        pprint(res["best"])
        mean = mean + res['best']['selection_metric']
    mean = mean / num_rep
    print(mean)
Example #59
def main():
    args = get_args()
    corpus = Corpus("text8", args.gram_min, args.gram_max, args.part == "part")
    subword_embeddings, _ = train_fasttext(corpus,
                                           ns_num=args.ns,
                                           window_size=5,
                                           dimension=100,
                                           learning_rate=0.01,
                                           epoch=1,
                                           subsampling=True)
    find_similar_words(corpus, subword_embeddings, args.gram_min,
                       args.gram_max)
Example #60
    def prepare_data(self, parse_path, rel_path, which, to_file):
        count = 0
        processed = []
        rel_dict = Corpus.read_relations(rel_path)
        for art in Corpus.read_parses(parse_path, rel_dict):
            for rel in art.relations:
                if rel.rel_type != 'Explicit':
                    continue
                rel.article = art
                rel.get_conn_leaves()
                rel.get_arg_leaves()

                # add a filter function (2015/9/29)
                if which == 'train' and not self.need_extract(rel):
                    continue
                count += 1

                processed.append(self.print_features(rel, which, to_file))

        print >> logs, "processed %d instances" % count
        return processed