Example no. 1
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""

    trainPath = 'data/tagged-train.dat'
    trainingCorpus = Corpus(trainPath)

    devPath = 'data/tagged-dev.dat'
    devCorpus = Corpus(devPath)

    # print 'Unigram Language Model: '
    # unigramLM = UnigramModel(trainingCorpus)
    # unigramSpell = SpellCorrect(unigramLM, trainingCorpus)
    # unigramOutcome = unigramSpell.evaluate(devCorpus)
    # print str(unigramOutcome)

    # print 'Uniform Language Model: '
    # uniformLM = UniformModel(trainingCorpus)
    # uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    # uniformOutcome = uniformSpell.evaluate(devCorpus)
    # print str(uniformOutcome)

    # print 'Smooth Unigram Language Model: '
    # smoothUnigramLM = SmoothUnigramModel(trainingCorpus)
    # smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus)
    # smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus)
    # print str(smoothUnigramOutcome)

    print 'Smooth Bigram Language Model: '
    smoothBigramLM = SmoothBigramModel(trainingCorpus)
    smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus)
    smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus)
    print str(smoothBigramOutcome)
Example no. 2
    def build_index(self):
        '''
        This function builds the inverted index: it inserts each URL into the
        doc table (which assigns a doc_id), inserts each token into the tokenT
        table, and inserts the token, doc_id, term frequency, and weight into
        the web_index table.
        '''

        c = Corpus()
        t = Tokenizer()

        for url, name in c.get_file_name():
            if len(url) > 1000:
                continue
            result = t.tokenize(name)
            if len(result) == 0:
                continue
            print(url)
            doc_id = 1

            #Insert URL to table DOC
            sql = "INSERT INTO web.doc(url) values (%s)"
            val = (url, )
            self.mycursor.execute(sql, val)
            self.mydb.commit()

            print(self.mycursor.rowcount, "was inserted in URL.")

            print(url)
            s_sql = "select id from doc where url=%s"
            self.mycursor.execute(s_sql, val)
            myresult = self.mycursor.fetchone()
            doc_id = myresult[0]
            print("DOC_ID IS " + str(doc_id))

            #Insert token, doc_id, tf into web_index
            t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"

            t_val = []
            for token in result.keys():
                t_val.append(
                    (token, doc_id, result[token][0], result[token][1]))

            #print(t_val)

            self.mycursor.executemany(t_sql, t_val)

            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

            #insert into TokenT table
            count = 0
            for token in result.keys():
                tq = "Insert ignore into tokenT values (%s)"
                tv = (token, )
                self.mycursor.execute(tq, tv)
                self.mydb.commit()
                count += 1

            print("inserted " + str(count) + " Tokens")
Example no. 3
def searchengine(directory):
    stopWords = set(stopwords.words("english"))
    # stemming
    ps = PorterStemmer()

    # create InvertedIndex obj
    invertedIndex = InvertedIndex()
    # build the corpus 
    Corp = Corpus()
    corpus = Corp.buildCorpus(directory)
    for docId in corpus: 
        doc = corpus[docId] 
        content = doc.getContent()
        # tokenize 
        tokens = word_tokenize(content)
        
        for token in tokens:
            token = token.lower()
            # apply stemming 
            token = ps.stem(token)

            # remove stopwords 
            if token in stopWords:
                continue
            # add to index 
            invertedIndex.addTerm(token, docId)
        
    return invertedIndex, corpus
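This snippet relies on a few NLTK pieces that are not imported here. The lines below show the imports and one-time resource downloads it presumably assumes; Corpus and InvertedIndex are project-specific classes and are not covered.

# NLTK imports and resource downloads assumed by searchengine().
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download("punkt")      # tokenizer models used by word_tokenize()
nltk.download("stopwords")  # word lists used by stopwords.words("english")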
Example no. 4
def main2():
    corpus = {}
    
    db = Corpus(database="sanal", collection=sys.argv[1])
    query = {}
    for i, item in enumerate(db.find(query)):

        text = item["text"]
        words = getwords(unicode(text))

        wordsd = {}
        for w in words:
            countup(wordsd, w)

        doc = { "text": wordsd, "id": item["id"] }
        #u = item["user"]["screen_name"]
        u = item["screen_name"]

        try:
            corpus[u].append(doc)
        except KeyError:
            corpus[u] = [ doc ]
    
        print(i)


    with file(sys.argv[2], "w") as opened:
        for k, v in corpus.items():
            opened.write("%s\n" % json.dumps({k: v}))
Example no. 5
def verificarPlagioTimeProfile(diretorioCorpus, diretorioDocumento, limiar):
    '''
    Input: path to a folder containing files, path to a file, containment threshold for the check.
    Times the instantiation of a corpus object, of a document object, and of the plagiarism check of the document against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
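The docstring mentions timing, but the body only runs the check, so the function is presumably profiled externally. A minimal way to time a call is sketched below; the paths and the 0.5 threshold are placeholders, not values from the project.

# Hypothetical timing of a single call; the arguments are placeholders.
import time

inicio = time.time()
verificarPlagioTimeProfile("corpus_dir/", "documento.txt", 0.5)
print("elapsed: %.3f s" % (time.time() - inicio))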
Example no. 6
    def __init__(self, file_text, doc_name, corpus, n_gram_length):
        Corpus.__init__(self, file_text, doc_name, n_gram_length)
        self.tf_idf = {}

        for n_gram in self.n_gram_count:
            tf = float(self.n_gram_count[n_gram])
            idf = float(self.doc_length * (corpus.n_gram_count[n_gram] + 1))
            self.tf_idf[n_gram] = log(tf / idf)
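For reference, the weight computed in this constructor is, for each n-gram g, log(count_doc(g) / (doc_length * (count_corpus(g) + 1))): a log ratio of the raw in-document count to a length-scaled, add-one-smoothed corpus count. The helper below restates that computation in isolation; it is an illustration, not part of the original class.

# Illustration only: the per-n-gram weight computed in the __init__ above.
from math import log

def ngram_weight(count_doc, count_corpus, doc_length):
    return log(float(count_doc) / (doc_length * (count_corpus + 1)))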
Example no. 7
def verificarPlagioMemUsageProfile(diretorioCorpus, diretorioDocumento,
                                   limiar):
    '''
    Input: path to a folder containing files, path to a file, containment threshold for the check.
    Measures the memory used by the instantiation of a corpus object, of a document object, and by the plagiarism check of the document against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
Example no. 8
 def __init__(self, verbose=False):
     print('Loading corpus ...')
     self.corpus = Corpus(verbose=verbose)
     self.corpus.create_data()
     self.X_seqs, self.y_seqs = self.corpus.X_seqs, self.corpus.y_seqs
     self.seq_count = len(self.X_seqs)
     for seq_idx in range(self.seq_count):
         assert (len(self.X_seqs[seq_idx]) == len(self.y_seqs[seq_idx]))
     self.feature_dim = len(self.X_seqs[0][0])
Example no. 9
 def train(self, numIterations=100, testCorpusPath=None):
   if testCorpusPath:
     testCorpus = Corpus(testCorpusPath)
   for i in range(1, numIterations + 1):
     self.algorithm.train() # call train method from algorithm
     if i % 10 == 0:
       # trainEval = Evaluation(self.algorithm.corpus)
       # print "Training evaluation for", i, "iteration(s):\n", trainEval.format()
       # self.algorithm.corpus.resetSentStats()
       if testCorpusPath:
         self.setPredictedTags(testCorpus) 
         testEval = Evaluation(testCorpus)
         print "Testing evaluation for", i, "iteration(s):\n",testEval.format()
         testCorpus.resetSentStats() # !!! we can use prototype pattern(so we don't need to loop through sents): here testCorpus = testCorpus.getPrototype() and in Corpus::__init__ : self.prototype = self (google : python prototype)?
Example no. 10
def main():
    corpus = Corpus()
    translator = DirectTranslator()
    modified = ModifiedTranslator()
    testCorpus = corpus.fullCorpus()
    for pair in testCorpus:
        spanishSentence = pair[0]
        realEnglishSentence = pair[1]
        modifiedTranslatedSentence = modified.translateSentence(spanishSentence)
        translatedSentence = translator.translateSentence(spanishSentence)
        print "*** Translation ****"
        print "Spanish: " + spanishSentence
        print "Direct Translation: " + translatedSentence
        print "Modified Translation: " + modifiedTranslatedSentence
        print "Human Translation: " + realEnglishSentence
Example no. 11
 def setup_corpus(self, theme, nb_docs):
     #contains the filtered words
     self.WORDS = theme + ";"
     #the 3 centrality metrics (one dictionary per word)
     self.DEGCEN = {}
     self.CLOCEN = {}
     self.BETCEN = {}
     #the corpus theme
     self.THEME = theme
     #number of documents in the corpus
     self.NB_DOCS = nb_docs
     #the corpus
     self.corpus = Corpus(theme)
     self.corpus.download_collection(nb_docs, keyword=theme)
     self.A = self.corpus.get_adjacency_matrix()
Example no. 12
def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)
    #sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE,
                      sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)

    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle],
                                        usePercentage=True)
        documentSamples[documentTitle] = documentSample

    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)

    dataLabels = sorted(list(wordCounter.occurrences.keys()))
    dataSets = []
    for dataLabel in dataLabels:
        #dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)

    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, CONFIDENCE,
                                          words)
    statisticsPlotter.plotStatistics(functionType=functionType)
Example no. 13
def main():
  """Sanity checks the edit model on the word 'hi'."""

  trainPath = 'data/tagged-train.dat'
  trainingCorpus = Corpus(trainPath)
  editModel = EditModel("data/count_1edit.txt", trainingCorpus)
  #These are for testing, you can ignore them
  DELETE_EDITS = set(['Edit(editedWord=i, rule=<h|<)', 'Edit(editedWord=h, rule=hi|h)'])
  INSERT_EDITS = set([Edit('ahi','<','<a'),Edit('bhi','<','<b'),Edit('chi','<','<c'),Edit('dhi','<','<d'),Edit('ehi','<','<e'),Edit('fhi','<','<f'),Edit('ghi','<','<g'),Edit('hhi','<','<h'),Edit('ihi','<','<i'),Edit('jhi','<','<j'),Edit('khi','<','<k'),Edit('lhi','<','<l'),Edit('mhi','<','<m'),Edit('nhi','<','<n'),Edit('ohi','<','<o'),Edit('phi','<','<p'),Edit('qhi','<','<q'),
    Edit('rhi','<','<r'),Edit('shi','<','<s'),Edit('thi','<','<t'),Edit('uhi','<','<u'),Edit('vhi','<','<v'),Edit('whi','<','<w'),Edit('xhi','<','<x'),Edit('yhi','<','<y'),Edit('zhi','<','<z'),Edit('hai','h','ha'),Edit('hbi','h','hb'),Edit('hci','h','hc'),Edit('hdi','h','hd'),Edit('hei','h','he'),Edit('hfi','h','hf'),Edit('hgi','h','hg'),Edit('hhi','h','hh'),
    Edit('hii','h','hi'),Edit('hji','h','hj'),Edit('hki','h','hk'),Edit('hli','h','hl'),Edit('hmi','h','hm'),Edit('hni','h','hn'),Edit('hoi','h','ho'),Edit('hpi','h','hp'),Edit('hqi','h','hq'),Edit('hri','h','hr'),Edit('hsi','h','hs'),Edit('hti','h','ht'),Edit('hui','h','hu'),Edit('hvi','h','hv'),Edit('hwi','h','hw'),Edit('hxi','h','hx'),Edit('hyi','h','hy'),Edit('hzi','h','hz'),
    Edit('hia','i','ia'),Edit('hib','i','ib'),Edit('hic','i','ic'),Edit('hid','i','id'),Edit('hie','i','ie'),Edit('hif','i','if'),Edit('hig','i','ig'),Edit('hih','i','ih'),Edit('hii','i','ii'),Edit('hij','i','ij'),Edit('hik','i','ik'),Edit('hil','i','il'),Edit('him','i','im'),Edit('hin','i','in'),Edit('hio','i','io'),Edit('hip','i','ip'),Edit('hiq','i','iq'),Edit('hir','i','ir'),
    Edit('his','i','is'),Edit('hit','i','it'),Edit('hiu','i','iu'),Edit('hiv','i','iv'),Edit('hiw','i','iw'),Edit('hix','i','ix'),Edit('hiy','i','iy'),Edit('hiz','i','iz')])
  TRANPOSE_EDITS = set([Edit('ih','hi','ih')])
  REPLACE_EDITS = set([Edit('ai','h','a'),Edit('bi','h','b'),Edit('ci','h','c'),Edit('di','h','d'),Edit('ei','h','e'),Edit('fi','h','f'),Edit('gi','h','g'),Edit('ii','h','i'),Edit('ji','h','j'),
    Edit('ki','h','k'),Edit('li','h','l'),Edit('mi','h','m'),Edit('ni','h','n'),Edit('oi','h','o'),Edit('pi','h','p'),Edit('qi','h','q'),Edit('ri','h','r'),Edit('si','h','s'),Edit('ti','h','t'),
    Edit('ui','h','u'),Edit('vi','h','v'),Edit('wi','h','w'),Edit('xi','h','x'),Edit('yi','h','y'),Edit('zi','h','z'),Edit('ha','i','a'),Edit('hb','i','b'),Edit('hc','i','c'),Edit('hd','i','d'),Edit('he','i','e'),Edit('hf','i','f'),Edit('hg','i','g'),Edit('hh','i','h'),Edit('hj','i','j'),
    Edit('hk','i','k'),Edit('hl','i','l'),Edit('hm','i','m'),Edit('hn','i','n'),Edit('ho','i','o'),Edit('hp','i','p'),Edit('hq','i','q'),Edit('hr','i','r'),Edit('hs','i','s'),Edit('ht','i','t'),
    Edit('hu','i','u'),Edit('hv','i','v'),Edit('hw','i','w'),Edit('hx','i','x'),Edit('hy','i','y'),Edit('hz','i','z')])

  print "***Code Sanity Check***"
  print "Delete edits for 'hi'"
  checkOverlap(set(editModel.deleteEdits('hi')), DELETE_EDITS)
  print "Insert edits for 'hi'"
  checkOverlap(set(editModel.insertEdits('hi')), INSERT_EDITS)
  print "Transpose edits for 'hi'"
  checkOverlap(set(editModel.transposeEdits('hi')), TRANPOSE_EDITS)
  print "Replace edits for 'hi'"
  checkOverlap(set(editModel.replaceEdits('hi')), REPLACE_EDITS)
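checkOverlap() is referenced above but not defined in this snippet. The sketch below shows the presumed intent (compare the produced edit set against the expected one and report differences); it is an assumption, not the original helper.

# Presumed behaviour of checkOverlap(): report sizes and any mismatches.
def checkOverlap(produced, expected):
  print "produced %d edits, expected %d" % (len(produced), len(expected))
  missing = expected - produced
  extra = produced - expected
  if missing:
    print "missing:", sorted(str(e) for e in missing)
  if extra:
    print "unexpected:", sorted(str(e) for e in extra)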
Example no. 14
def test_reverse_markov_dict():
    full_markov_dict = MarkovDict(source=None, depth=1)
    corpus = Corpus(importStrFromFile("./test/corpora/reverse.txt"))
    full_markov_dict.add(corpus)
    bot = MarkovBot(full_markov_dict)
    print(bot.forward_dict.dict)
    print(bot.reverse_dict.dict)
Example no. 15
 def train(self, numIterations=100, testCorpusPath=None):
     if testCorpusPath:
         testCorpus = Corpus(testCorpusPath)
     for i in range(1, numIterations + 1):
         self.algorithm.train()  # call train method from algorithm
         if i % 10 == 0:
             # trainEval = Evaluation(self.algorithm.corpus)
             # print "Training evaluation for", i, "iteration(s):\n", trainEval.format()
             # self.algorithm.corpus.resetSentStats()
             if testCorpusPath:
                 self.setPredictedTags(testCorpus)
                 testEval = Evaluation(testCorpus)
                 print "Testing evaluation for", i, "iteration(s):\n", testEval.format(
                 )
                 testCorpus.resetSentStats(
                 )  # !!! we can use prototype pattern(so we don't need to loop through sents): here testCorpus = testCorpus.getPrototype() and in Corpus::__init__ : self.prototype = self (google : python prototype)?
Example no. 16
def main():
    """Trains all of the language models and tests them on the dev data. Change devPath if you
     wish to do things like test on the training data."""

    # load training data
    trainPath = 'data/tagged-train.dat'
    trainingCorpus = Corpus(trainPath)

    # load dev data
    devPath = 'data/tagged-dev.dat'
    devCorpus = Corpus(devPath)

    print 'Unigram Language Model: '
    unigramLM = UnigramModel(trainingCorpus)
    unigramSpell = SpellCorrect(unigramLM, trainingCorpus)
    unigramOutcome = unigramSpell.evaluate(devCorpus)
    print str(unigramOutcome)

    print 'Uniform Language Model: '
    uniformLM = UniformModel(trainingCorpus)
    uniformSpell = SpellCorrect(uniformLM, trainingCorpus)
    uniformOutcome = uniformSpell.evaluate(devCorpus)
    print str(uniformOutcome)

    print 'Smooth Unigram Language Model: '
    smoothUnigramLM = SmoothUnigramModel(trainingCorpus)
    smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus)
    smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus)
    print str(smoothUnigramOutcome)

    print 'Smooth Bigram Language Model: '
    smoothBigramLM = SmoothBigramModel(trainingCorpus)
    smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus)
    smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus)
    print str(smoothBigramOutcome)

    print 'Backoff Language Model: '
    backoffLM = BackoffModel(trainingCorpus)
    backoffSpell = SpellCorrect(backoffLM, trainingCorpus)
    backoffOutcome = backoffSpell.evaluate(devCorpus)
    print str(backoffOutcome)

    print 'Custom Language Model: '
    customLM = CustomModel(trainingCorpus)
    customSpell = SpellCorrect(customLM, trainingCorpus)
    customOutcome = customSpell.evaluate(devCorpus)
    print str(customOutcome)
Example no. 17
def main():
    if (len(sys.argv) != 6):
        print "usage: python main.py <init_alpha> <modeldir_name> <num_topic> <data_file> <random/load>"
        sys.exit(1)

    init_alpha = float(sys.argv[1])
    directory = sys.argv[2]
    num_topics = int(sys.argv[3])
    data_file = sys.argv[4]
    start_type = sys.argv[5]

    # read_data
    corpus = Corpus()
    corpus.read_data(data_file)

    # Run LDA
    LdaEstimator.run_EM(init_alpha, directory, num_topics, corpus, start_type)
Example no. 18
def main():

	if len(sys.argv) != 3:
		print "Please provide paths to train and test corpora!"
	else:

		training_corpus = Corpus(sys.argv[1])
		test_corpus = Corpus(sys.argv[2])
		len_pos_train = len(training_corpus.generate_pos_pairs())
		len_neg_train = len(training_corpus.generate_neg_pairs())
		training_corpus.create_mallet_file("training_file_mallet.txt")

		len_test = len(test_corpus.generate_pos_pairs()) + len(test_corpus.generate_neg_pairs())
		test_corpus.create_test_file("test_file_mallet.txt")


		print "There are " + str(len_pos_train) + " positive training instances and " + str(len_neg_train) + " negative training instances."
		print "There are " + str(len_test) + " test instances."
Example no. 19
    def __init__(self, clustered_corpus):
        self.corpora = []
        for cluster in clustered_corpus:
            corpus = Corpus(cluster)
            self.corpora.append(corpus)
        if len(self.corpora) < 2:
            raise ValueError("clustered_corpus argument is not clustered")

        self.candidate_to_cu_mapping = self.calculate_cus_for()
Example no. 20
        def train(self,path_to_truth_dir):
                corpus = Corpus(path_to_truth_dir)
                #Read truth file
                truth = methods.read_classification_from_file(methods.add_slash(path_to_truth_dir)+"!truth.txt")
                #Make truth global
                self.truth = truth
                for fname, body in corpus.emails_as_string():
                        email_as_file = open(methods.add_slash(path_to_truth_dir) + fname,'r',encoding = 'utf-8')
                        #Read email with EMAIL parser
                        msg = email.message_from_file(email_as_file)
                        self.extract_senders_list(msg,fname)
                        self.check_subject(msg,fname)

                #Generate dict's
                methods.generate_file_from_dict(self.path_bl , self.black_list)
                methods.generate_file_from_dict(self.path_wl ,self.white_list)
                methods.generate_file_from_dict(self.path_ssl , self.spam_subject_list)
                methods.generate_file_from_dict(self.path_hsl ,self.ham_subject_list)
Example no. 21
    def generate_feature_csv(self, feature_csv, pos_lexicon, neg_lexicon, postag_instances=None):
        """
         Generates a CSV file with features extracted from the instances according to the data-driven (DD) model
        :param feature_csv:
        :param pos_lexicon:
        :param neg_lexicon:
        :param postag_instances:
        :return:
        """
        if postag_instances:
            corpus_postag_set = Corpus.get_postag_set(postag_instances) # return all tags in corpus in a list
        else:
            corpus_postag_set = Corpus.get_postag_set(self.instances) # return all tags in corpus in a list

        # feature file header: ID, text, pos_feature, neg_feature, percentages for all corpus tags, label
        with open(feature_csv, 'wb') as f:
            wr = csv.writer(f)
            id = 1
            wr.writerow(["ID", "text", "pos", "neg"]+corpus_postag_set+["label"])
            for inst in self.instances:
                inst_postags = [token.get_tag() for token in inst.get_tokens()]
                inst_postag_counter = Counter(inst_postags)
                postag_percent = []
                for tag in corpus_postag_set:
                    if tag in inst_postag_counter:
                        # percentage of words belonging to each POS in instance
                        postag_percent.append(inst_postag_counter[tag]/inst.get_length())
                    else:
                        postag_percent.append(0)
                # tokens_list = [token.get_text() for token in inst.get_tokens()]
                tokens_list = [token for token in inst.get_tokens()] # tokens as objects
                pos_neg_list = self.get_lexicon_features(tokens_list, pos_lexicon, neg_lexicon)
                # wr.writerow([id, inst.get_text(), pos_neg_list[0], pos_neg_list[1]]+postag_percent+[inst.get_label_gold()])
                wr.writerow(
                    [unicode(id).encode("utf-8"),
                     unicode(inst.get_text()).encode("utf-8"),
                     unicode(pos_neg_list[0]).encode("utf-8"),
                     unicode(pos_neg_list[1]).encode("utf-8")]
                    + postag_percent
                    + [unicode(inst.get_label_gold()).encode("utf-8")])
                id += 1
        return feature_csv, corpus_postag_set
Example no. 22
def test():
    c1 = Corpus("Big round boulder. That is a round snake.")
    c2 = Corpus("The dog is fat. The dog eats food. My dog is yellow. Your cat is yellow.")
    c3 = Corpus("Look out! Look behind you. Are you there? Are you okay? To you, I defer.")

    m1 = MarkovDict(c1)
    m2 = MarkovDict(c2, 2)
    m3 = MarkovDict(c3)

    print ("m1:", m1.response())
    print ("m1:", m1.response())
    print ("m1:", m1.response())

    print ("m2:", m2.response())
    print ("m2:", m2.response())
    print ("m2:", m2.response())

    print ("m3:", m3.response())
    print ("m3:", m3.response())
    print ("m3:", m3.response())
Example no. 23
def downloadCorpus(snapshotDir, corpusDir, projectName, configInfo):

    # 2. Dump the snapshots for a project
    msg = '---------------------------------------------------- \n'
    msg += ' Dump the corpus for project %s \n' % projectName
    msg += '---------------------------------------------------- \n'
    print(msg)

    project_snapshot_dir = os.path.join(snapshotDir, projectName)
    project_corpus_dir = os.path.join(corpusDir, projectName)

    if os.path.isdir(project_corpus_dir):
        print "!! %s already exists...returning \n" % project_corpus_dir
        #return

    corpus = Corpus(project_snapshot_dir, 'java', project_corpus_dir,
                    configInfo)
    #logging.debug(corpus)
    #print corpus
    corpus.dump()
Example no. 24
def main():
    clustered_corpus_path = 'clustered_corpus'
    clustered_corpus = read_clustered_corpus(clustered_corpus_path)
    corpus = merge_clustered_corpus_into_a_single_corpus(clustered_corpus)

    target_file_path = 'target.txt'
    text = read_text_file(target_file_path)
    document = Document(text)

    corpus = Corpus(corpus)
    clustered_corpus = ClusteredCorpus(clustered_corpus)

    candidate_to_rank_mapping = {}
    candidate_to_params_mapping = {}
    candidate_to_dfs_in_each_cluster_mapping = {}

    for candidate in document.get_candidates():
        tf = math.log(1.0 + document.get_tf_for(candidate), 10.0)
        # tf = document.get_tf_for(candidate)
        idf = math.log(1.0 + 1.0 / corpus.get_df_for(candidate), 2.0)
        cu = clustered_corpus.get_cu_for(candidate)

        rank = cu
        # rank = tf * cu
        # rank = tf * idf

        dfs_in_each_cluster = clustered_corpus.get_dfs_in_each_cluster_for(candidate)

        candidate_representative = corpus.get_representative_for(candidate)
        candidate_to_rank_mapping[candidate_representative] = rank
        candidate_to_params_mapping[candidate_representative] = (tf, idf, cu)
        candidate_to_dfs_in_each_cluster_mapping[candidate_representative] = dfs_in_each_cluster

    table = generate_table_based_on(
        candidate_to_rank_mapping,
        candidate_to_params_mapping,
        candidate_to_dfs_in_each_cluster_mapping
    )

    save_as_file(table)
    print('Done.')
Example no. 25
 def __init__(self,
              text,
              keywords=None,
              remove_stopword=True,
              with_segs=False):
     self.text = text
     self.corpus = Corpus(text,
                          keywords=keywords,
                          remove_stopword=remove_stopword,
                          with_segs=with_segs)
     self.network = nx.Graph()
     self.build_network()
Example no. 26
    def generate_combined_features(self, feature_csv):
        feature_rows = pd.read_csv(feature_csv)
        # Create vectorizer for function to use
        vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
        y = feature_rows["label"].values.astype(np.float32)

        X = sp.sparse.hstack(
            (vectorizer.fit_transform(feature_rows.text),
             feature_rows[['pos', 'neg'] +
                          Corpus.get_postag_set(self.instances)].values),
            format='csr')
        return X, y, vectorizer
Example no. 27
def main():
    def get_data():
        client = pymongo.MongoClient()
        db = client.twitter4
        cursor = db.stream.aggregate([
            {'$match': {
                'date': {
                    '$gt': datetime.datetime(2015, 11, 13)
                }
            }},
            {'$sort': {'date': 1}},
            {'$project': {'text': 1, 'date': 1}},
        ])
        return cursor

    def get_remote_data():
        client = pymongo.MongoClient(host='59.77.134.176')
        db = client.twitter3
        cursor = db.stream.aggregate([
            # {'$sort': {'date': 1}},
            {'$project': {'text': 1}},
        ])
        return cursor

    cursor = get_data()
    print 'calculate_entropy: duplicate words are only counted once'
    olda = None
    reallen = 0
    # for chunk_no, doc_chunk in enumerate(cursor_serial(cursor, 3000)):
    for chunk_no, doc_chunk in enumerate(chunkize_serial(cursor, 3000, as_numpy=False)):
        print doc_chunk[0]['date']
        doc_chunk = [tweet['text'] for tweet in doc_chunk]

        reallen += len(doc_chunk)

        print chunk_no, reallen - len(doc_chunk), reallen, len(doc_chunk), 'lda'
        start = datetime.datetime.now()
        if not olda:
            corpus = Corpus(doc_chunk)
            olda = OnlineLDA(corpus, K=10)
        else:
            olda.fit(doc_chunk)
        # Give them to online LDA

        print datetime.datetime.now() - start
        with codecs.open(r'G:\test18.out', "w", "utf-8-sig") as f:
            for topic_id, (topic_likelihood, topic_words, topic_tweets) in olda.get_lda_info():
                print '{}%\t{}'.format(round(topic_likelihood * 100, 2), topic_words)
                print '\t', topic_tweets
                f.write(topic_tweets + '\n')

        print '\n\n\n\n\n\n'
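chunkize_serial() is not defined in this snippet (gensim ships a utility of the same name and call shape in gensim.utils). A minimal stand-in with the signature used above is sketched here as an assumption, so the loop can be read on its own.

# Minimal stand-in for chunkize_serial(iterable, chunksize, as_numpy=False):
# yields lists of up to `chunksize` consecutive items from the cursor.
def chunkize_serial(iterable, chunksize, as_numpy=False):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunksize:
            yield chunk
            chunk = []
    if chunk:
        yield chunk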
Example no. 28
    def createWordLookup(self, foreignSentence):
        corpus = Corpus()
        tokenDictList = []

        """Captures only words, no spaces/punctuation"""
        spanishTokens = re.compile('(\W+)', re.UNICODE).split(unicode(foreignSentence, 'utf-8'))
        spanishTokens.pop()
        
        for idx, token in enumerate(spanishTokens):
            tokenDict = dict()
            tokenDict['originalToken'] = token
            tokenDict['spanish_POS'] = corpus.spanishTags().get(token, None)
            if (len(token) > 0):
                if token[0].isupper():
                    tokenDict['upper'] = True
                else:
                    tokenDict['upper'] = False
            else:
                tokenDict['upper'] = False
            tokenDictList.append(tokenDict)
            
        self.tokenDictList = tokenDictList
Example no. 29
 def __init__(self,
              text,
              keywords=None,
              remove_stopword=True,
              with_segs=False,
              weight_type='count'):
     self.text = text
     self.corpus = Corpus(text,
                          keywords=keywords,
                          remove_stopword=remove_stopword,
                          with_segs=with_segs)
     self.network = nx.Graph()
     self._network(weight_type)
Example no. 30
    def generate_feature_csv(self,
                             feature_csv,
                             pos_lexicon,
                             neg_lexicon,
                             postag_instances=None):
        if postag_instances:
            corpus_postag_set = Corpus.get_postag_set(
                postag_instances)  # return all tags in corpus in a list
        else:
            corpus_postag_set = Corpus.get_postag_set(
                self.instances)  # return all tags in corpus in a list

        # ID, text, pos_feature, neg_feature, percentages for all corpus tags, label
        with open(feature_csv, 'wb') as f:
            # wr = csv.writer(f, quoting=csv.QUOTE_ALL)
            wr = csv.writer(f)
            id = 1
            wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set +
                        ["label"])
            for inst in self.instances:
                inst_postags = [token.get_tag() for token in inst.get_tokens()]
                inst_postag_counter = Counter(inst_postags)
                postag_percent = []
                for tag in corpus_postag_set:
                    if tag in inst_postag_counter:
                        # percentage of words belonging to each POS in instance
                        postag_percent.append(inst_postag_counter[tag] /
                                              inst.get_length())
                    else:
                        postag_percent.append(0)
                pos_neg_list = self.get_lexicon_features(
                    inst.get_text(), pos_lexicon, neg_lexicon)
                wr.writerow(
                    [id, inst.get_text(), pos_neg_list[0], pos_neg_list[1]] +
                    postag_percent + [inst.get_label_gold()])
                id += 1
        return feature_csv, corpus_postag_set
Example no. 31
def create_corpus():
    corpus = Corpus()
    for folder in glob.iglob('texts/*'):
        for filename in glob.iglob(folder + "/*"):
            corpus.add_document(Document(filename))
            # corpus.add_document(Document(folder))

    corpus.build_vocabulary()
    return corpus
Example no. 32
        def test(self, path_to_test_dir):
                predictions = {} #Predictions dict {fname:prediction}
                bs = Bayesian.Bayesian()
                corpus = Corpus(path_to_test_dir)
                #Read dict's (if test called before train)
                black_list_dict = methods.read_dict_from_file(self.path_bl)
                white_list_dict = methods.read_dict_from_file(self.path_wl)
                spam_subject_dict = methods.read_dict_from_file(self.path_ssl)
                ham_subject_dict = methods.read_dict_from_file(self.path_hsl)
                
                for fname, body in corpus.emails_as_string():
                        #Open email with parser
                        email_as_file = open(methods.add_slash(path_to_test_dir) + fname,'r',encoding = 'utf-8')
                        msg = email.message_from_file(email_as_file)

                        #Check if sender in a black list
                        if (self.extract_email_adress_from_text(msg['From']) in black_list_dict):
                                predictions[fname] = 'SPAM'
                        elif(self.extract_email_adress_from_text(msg['From']) in white_list_dict):
                        #Check if sender in a white list
                                predictions[fname] = 'OK'
                        #Check if subject in a black list
                        elif(self.extract_email_adress_from_text(msg['From']) in spam_subject_dict):
                                predictions[fname] = 'SPAM'
                        #Check if subject in a white list
                        elif(self.extract_email_adress_from_text(msg['From']) in ham_subject_dict):
                                predictions[fname] = 'OK'
                        #Run Bayesian checker
                        else:                
                                if (bs.bayesian_prediction(methods.get_text(msg))) > 0.485:
                                        predictions[fname] = 'SPAM'
                                else:
                                        predictions[fname] = 'OK'

                #Generate prediction file
                bf = BaseFilter(path_to_test_dir,predictions)
                bf.generate_prediction_file()
Example no. 33
    def get_description(self):
        '''
        This function gets all the URLs, finds their description text,
        and updates it in the database
        '''
        #get doc_id
        self.mycursor.execute("select id,url from doc")
        myresult = self.mycursor.fetchall()
        for doc_id, url in myresult:
            #print("**********Doc ID is "+str(doc_id)+" ********")
            c = Corpus()
            name = c.url_to_dir(url)
            #print("Name is "+ name)
            with open(name, "rb") as file:
                content = file.read()
                soup = BeautifulSoup(content, "lxml")
                metas = soup.find_all("meta")
                result = ''
                for meta in metas:
                    if ('content' in meta.attrs) and ('name' in meta.attrs) and \
                       ((meta.attrs['name'] == 'description') or (meta.attrs['name'] == 'keywords')):
                        result = " ".join(meta.attrs['content'].split())

                #if html doesn't have description tag
                if result == '':
                    script = soup.find(
                        ["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"])
                    if script:
                        temp = " ".join(script.text.split())
                        result += temp if len(temp) < 200 else ""
                print(result)
                i_sql = "update doc set description =%s where id = %s"
                i_val = (result, doc_id)
                self.mycursor.execute(i_sql, i_val)
                self.mydb.commit()
                print(self.mycursor.rowcount,
                      "was inserted in DOC , DOC ID IS " + str(doc_id))
Example no. 34
 def __init__(self):
     self.emotion = {
         'surprise': 0,
         'anger': 1,
         'happy': 2,
         'love': 3,
         'fear': 4,
         'trust': 5,
         'disgust': 6,
         'sad': 7
     }
     self.tp = 0
     self.tn = 0
     self.fp = 0
     self.fn = 0
     self.accuracy = 0
     self.precision = 0
     self.recall = 0
     self.F1 = 0
     # self.list_gold = list_gold
     # self.list_prediction = list_prediction
     cp = Corpus()
     self.list_gold = cp.read_label()
     self.list_prediction = cp.read_prediction()
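The counters above suggest the standard accuracy/precision/recall/F1 definitions, although the computation itself is not shown in this snippet. The helper below restates them; it is a hypothetical addition, not part of the original class.

# Standard metric definitions from tp/tn/fp/fn counts (illustration only).
def compute_metrics(tp, tn, fp, fn):
    accuracy = (tp + tn) / float(tp + tn + fp + fn)
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return accuracy, precision, recall, f1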
Example no. 35
def test():
    print("Creating new markov dict...")
    print(getDepth())
    corpus_path = "./corpora/test/depth.txt"
    corpus = Corpus(importStrFromFile(corpus_path))
    print(corpus)
    reverse_corpus = list(reversed(corpus))
    print(reverse_corpus)
    forward_markov_dict = MarkovDict(source=corpus, depth=getDepth())
    reverse_markov_dict = MarkovDict(source=reverse_corpus, depth=getDepth())
    pprint(forward_markov_dict.dict)
    pprint(reverse_markov_dict.dict)
    bot = MarkovBot(forward_markov_dict, reverse_markov_dict)
    # pprint(bot.forward_dict.dict)
    # pprint(bot.reverse_dict.dict)
    print(bot.response(topic="markov"))
Example no. 36
def main():
    print("Creating new markov dict...")
    forward_markov_dict = MarkovDict(source=None, depth=getDepth())
    reverse_markov_dict = MarkovDict(source=None, depth=getDepth())
    print("Starting for loop to add corpora...")
    for corpus_path in corporaPaths():
        corpus = Corpus(importStrFromFile(corpus_path))
        print("Adding corpus with path '" + corpus_path + "'...")
        forward_markov_dict.add(corpus)
        reverse_markov_dict.add(list(reversed(corpus)))
    print("Initializing MarkovBot...")
    bot = MarkovBot(forward_markov_dict, reverse_markov_dict)
    print("\nWelcome to MarkovBot! Type a message. Type 'exit()' to quit.")
    message = prompt()
    while message != "exit()":
        print(bot.response(topic=message.split()[0]))
        message = prompt()
Example no. 37
    def __init__(self, data_path, corpus_file):
        """
		WorbEmb class init

		Parameters
		----------
		data_path : str
			data full path
		corpus_file : str
			protein domain corpus file name

		Returns
		-------
		None
		"""
        self.data_path = data_path
        self.corpus_file = corpus_file
        self.Corpus = Corpus(self.data_path, self.corpus_file)
        self.w2v_model = "none"
        self.w2v_file_out = ""
Example no. 38
class Searcher:
    def __init__(self, raw_documents):
        self.corpus = Corpus(raw_documents)
        self.metrics = Metrics(self.corpus)
    
    def search(self, query):
        results = []
        query_document = Document(query)
        query_stems = query_document.get_stems()
        documents = self.corpus.get_documents()
        for doc in documents:
            document_id = doc.get_id()
            score = 0.0
            stemmed_document = doc.get_stems()
            for qstem in query_stems:
                if qstem in stemmed_document:
                    term_frequency = self.metrics.get_term_frequency(document_id, qstem)
                    score += term_frequency
            if score > 0.0:
                results.append({"id": doc.get_id(), "score": score, "text": doc.get_text()})
        return results

    def get_corpus(self):
        return self.corpus
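A hypothetical usage of Searcher, assuming Corpus accepts a list of raw text strings (as the constructor argument name suggests) and that Document, Corpus, and Metrics are importable from the project; the sample texts and query are made up.

# Hypothetical usage sketch of the Searcher class above.
searcher = Searcher(["the quick brown fox", "a lazy dog sleeps"])
for hit in searcher.search("quick fox"):
    print(hit["id"], hit["score"], hit["text"])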
Example no. 39
 def __init__(self, raw_documents):
     self.corpus = Corpus(raw_documents)
     self.metrics = Metrics(self.corpus)
Example no. 40
	else:
		termino = load_terminology(config.termino_path)
		print "\nTerminology loaded"
	
	print "\nLoading %i files"%nb_files
	print "-------------------------------------------"
	
	docs = []
	
	# Load the files
	for i, f in enumerate(files):
		sys.stdout.write( "\r%3i/%i %s"%( i+1, nb_files, '{:<70}'.format(f) ) )
		sys.stdout.flush()
		docs.append(Document(f))
	
	corpus = Corpus(docs, termino)
	
	print "\n\nCorpus preprocessing"
	print "-------------------------------------------"
	corpus.preprocess()
	
	print "\n\nExtracting the keywords"
	print "-------------------------------------------"
	corpus.process()
	
	if Config().testing:
		print "\n\nResults (%s average)"%("Macro" if config.macro_average else "Micro")
		print "-------------------------------------------"
		corpus.results()
	else:
		print "\n"
Example no. 41
# -*- encoding: utf-8 -*-
# -*- coding: utf-8 -*-

import sys
import extractor
from Corpus import Corpus


if __name__ == "__main__":

    dbname, collname = sys.argv[1], sys.argv[2]
    corpus_db = Corpus(database=dbname, collection=collname)

    df_dbname, df_collname = dbname, sys.argv[3]

    df = {}
    for j, item in enumerate(corpus_db.find({})):
        for word in set( extractor.getwords(item["text"]) ):
            extractor.countup(df, word)

    with file(df_collname, "w") as opened:
        for word, freq in sorted(df.items(), key=lambda x:x[1], reverse=True):
            opened.write("%s\t%d\n" % (word, freq))


Example no. 42
    parser = argparse.ArgumentParser(description="says")

    parser.add_argument("-d", "--database", default="says")
    
    parser.add_argument("-i", "--items", default="items")
    parser.add_argument("-s", "--itemstats", default="itemstats")

    args = parser.parse_args()
    return args


if __name__ == "__main__":

    args = parse_args()

    db = Corpus(database=args.database, collection=args.items)
    db_stats = Corpus(database=args.database, collection=args.itemstats)
    
    try:
        latstats = db_stats.findsorted({}, key="id")[0]["id"]
    except IndexError:
        latstats = 0L

    for i, item in enumerate(db.find({ "id": { "$gt": latstats }})):

        words = extractd.getwords(item)
        messages = extractd.getmessages(item)
        tags = extractd.gethashtags(item)
        urls = extractd.geturls(item)
        
        db_stats.append({
Example no. 43
            uid = extractd.getid(n2i, u)
            vid = extractd.getid(n2i, v)

            graph.add_edge(uid, vid)
            
            extractd.countup(weights, (uid, vid))
            extractd.countup(weights, (vid, uid))

    with file('%s.wpairs' % sys.argv[1], 'w') as opened:

        for e in graph.edges():
            w = weights[(e[0], e[1])] if weights[(e[0], e[1])] <= weights[(e[1], e[0])] else weights[(e[1], e[0])]
            opened.write( '%d\t%d\t%d\n' % (e[0], e[1], w) )

    with file('%s.n2i' % sys.argv[1], 'w') as opened:
        for u in n2i:
            opened.write('%s\t%d\n' % (u, n2i[u]))


if __name__ == '__main__':
    
    dbinfo = Pit.get("says")
    db = Corpus(database=dbinfo["db"], collection=dbinfo["items"])

    t_end = time.mktime( datetime.today().timetuple() )
    t_begin = t_end - (24 * 60 * 60 * 10)
 
    items = [ item for item in db.find({'created_at': { '$gt': t_begin, '$lt': t_end }}) ]

    make_graph(items)
Example no. 44

def parse_args():

    usage = "[--interval] [interval] [-l] [path-to-log]"
   
    parser = argparse.ArgumentParser(description="says")
    parser.add_argument("--interval", type=float, default=1.0)
    parser.add_argument("-l", "--log", default=".log/log")

    args = parser.parse_args()
    return args


if __name__ == "__main__":

    args = parse_args()

    dbinfo = Pit.get("says")

    users_db = Corpus(database=dbinfo["db"], collection=dbinfo["users"])
    #users = users_db.find({})
    users = [ item["screen_name"] for item in users_db.find({}) ]

    api = activate_api()

    items_db = Corpus(database=dbinfo["db"], collection=dbinfo["items"])

    getitems(users, api, items_db)

Example no. 45
def shell(filelimit = 0):       
    #rootpath = "/home/dicle/Dicle/Tez/dataset/readingtest30/"
    corpuspath = "/home/dicle/Dicle/Tez/dataset/readingtest300/"
    rootpath = corpuspath
    folders = IOtools.getfoldernames_of_dir(corpuspath)
    foldername = ""
    corpus = Corpus(rootpath)
    singlefolder = False
    if len(folders) == 0:
        singlefolder = True
    
    
    if singlefolder:                                                    
        rootpath = corpuspath 
        #corpus = Corpus(rootpath, foldername)
        starttime = datetime.now()
        buildcorpus(corpus, rootpath, filelimit)
        endtime_buildcorpus = datetime.now()
        print "build corpus took: ",str(endtime_buildcorpus - starttime)
        print "corpus length ",str(len(corpus.words))," words"
    
    else:     
        for foldername in folders:
            
            print "Folder: ",foldername
            rootpath = corpuspath + os.sep + foldername + os.sep
            
            #corpus = Corpus(rootpath, foldername)
            
            starttime = datetime.now()
            
            buildcorpus(corpus, rootpath)
            endtime_buildcorpus = datetime.now()
            print "build corpus took: ",str(endtime_buildcorpus - starttime)
            print "corpus length ",str(len(corpus.words))," words"
            
    print "pickle-getting words"
    corpus.picklegetwords()    
    print "assigning pos tags" 
    assignPOStags(corpus)
    endtime_postags = datetime.now()
    print "postag assignment took: ",str(endtime_postags - endtime_buildcorpus)
    
    
    '''
    get_magnitudewords_doc_matrix(corpus)
    
    adjectives = get_words_ofPOStag(corpus, "ADJ")
    print "numof adjectives, ",len(adjectives),"  ",adjectives[:-10]
    get_docterm_matrix(corpus, adjectives, "adjective-doc-matrix.txt", record = True)
    '''
    endtime = datetime.now()
    passtime = endtime - starttime
    print "Elapsed time: ",passtime," on folder ",foldername
    
    print "pickle-dumping words"
    endtimep = datetime.now() 
    corpus.pickledumpwords()   
    print "Corpus length: ",len(corpus.words)  
    print "Elapsed time for pickle: ",str(endtimep - endtime)
    
    # PICKLE words
    print "pickle-getting words"
    corpus.picklegetwords()

    print "corpus first 20 words:"
    for word in corpus.words[:20]:
        word.toscreen()
        
    print "pickle-dumping words"
    corpus.pickledumpwords()
Example no. 46
        self.num_word = corpus.getWordNum()
        self.pz_w = self._rand_mat(self.num_topic, self.num_word)
        print self.pz_w

    def fit(self):
        """
        Train pLSA. Input format:
        [
            [token1, token2],
            [token1, token2, token3, token4]
        ]
        :return:
        """
        pass

    def transform(self):
        # no use
        pass

    def _rand_mat(self, sizex, sizey):
        mat = np.random.rand(sizex, sizey)
        for r in range(sizex):
            s = np.sum(mat[r])
            for c in range(sizey):
                mat[r][c] = mat[r][c] / s
        return mat

if __name__ == "__main__":
    corpus = Corpus("../data/topic/corpus")
    print corpus.getVocab()
    plsa = pLSA(corpus, 5)
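For reference, _rand_mat() above draws a random matrix and normalizes each row to sum to 1. An equivalent vectorized form is sketched below; it is an illustration, not a drop-in change to the class.

# Vectorized equivalent of _rand_mat(): a row-stochastic random matrix.
import numpy as np

def rand_mat(sizex, sizey):
    mat = np.random.rand(sizex, sizey)
    return mat / mat.sum(axis=1, keepdims=True)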
Example no. 47
import time

start_time = time.time()

#Path to the files that make up the corpus
path = 'docs/'

#List every document in the given folder
corpus = os.listdir(path)

#Build the full paths (instead of just the document names)
for i in range(0, len(corpus)):
	corpus[i] = path+corpus[i]

#Create the corpus
corpus = Corpus(corpus, 'stopwords.txt', 'dico/')

cheminRequete = "requete"
requeteFile = open(cheminRequete, 'w', -1, 'utf-8')
for i in range(1, len(sys.argv)):
	requeteFile.write(sys.argv[i]+" ")
requeteFile.close()

corpus.lemmatiserCorpus(cheminRequete)
corpus.vectoriserDocCorpus()

#Prepare a variable for the similarity computation
finale = {}
finale = corpus.calculSimilarite()

for i in range(0, len(finale)):
Example no. 48
from Corpus import Corpus
from Rule import PossibleRules

print("Analysis starting...")
train_corpus = Corpus(["./dataset/TrainingSet/file1.txt"
                 , "./dataset/TrainingSet/file2.txt"
                 , "./dataset/TrainingSet/file3.txt"
                 , "./dataset/TrainingSet/file4.txt"
                 , "./dataset/TrainingSet/file5.txt"
                 , "./dataset/TrainingSet/file6.txt"
                 , "./dataset/TrainingSet/file7.txt"
                 , "./dataset/TrainingSet/file8.txt"

                       ])

train_corpus.outputWords("./Output/MostLikelyMorphParseForWord.txt")
print("Most likely morphological parses for words are written to ./Output/MostLikelyMorphParseForWord.txt")

train_corpus.outputPOStags("./Output/MostLikelyTag.txt")
print("Most likely tags are written to ./Output/MostLikelyTag.txt")

train_corpus.tag_words_with_most_likely_parses()
tag_order = 1
print("TRAIN: Precision for DS" + str(tag_order) + " " + str(train_corpus.calculate_precision()))

print("Possible rules are generating...")
rules = PossibleRules(train_corpus.tags[:20]).rules  # just try first 20 words in the training corpus since it is expensive to walk through all the words
print(str(len(train_corpus.all_words_in_corpus)) + " words in training set.")

learned_rules_with_precision = []
for rule in rules: