def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = 'data/tagged-train.dat' trainingCorpus = Corpus(trainPath) devPath = 'data/tagged-dev.dat' devCorpus = Corpus(devPath) # print 'Unigram Language Model: ' # unigramLM = UnigramModel(trainingCorpus) # unigramSpell = SpellCorrect(unigramLM, trainingCorpus) # unigramOutcome = unigramSpell.evaluate(devCorpus) # print str(unigramOutcome) # print 'Uniform Language Model: ' # uniformLM = UniformModel(trainingCorpus) # uniformSpell = SpellCorrect(uniformLM, trainingCorpus) # uniformOutcome = uniformSpell.evaluate(devCorpus) # print str(uniformOutcome) # print 'Smooth Unigram Language Model: ' # smoothUnigramLM = SmoothUnigramModel(trainingCorpus) # smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus) # smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus) # print str(smoothUnigramOutcome) print 'Smooth Bigram Language Model: ' smoothBigramLM = SmoothBigramModel(trainingCorpus) smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus) smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus) print str(smoothBigramOutcome)
def build_index(self):
    '''
    Builds the inverted index: inserts each URL into the doc table with a doc_id,
    inserts each token into the tokenT table, and inserts token, doc_id,
    term frequency and weight into the web_index table.
    '''
    c = Corpus()
    t = Tokenizer()
    for url, name in c.get_file_name():
        if len(url) > 1000:
            continue
        result = t.tokenize(name)
        if len(result) == 0:
            continue
        print(url)
        doc_id = 1

        # Insert URL into the doc table
        sql = "INSERT INTO web.doc(url) values (%s)"
        val = (url, )
        self.mycursor.execute(sql, val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in URL.")
        print(url)

        s_sql = "select id from doc where url=%s"
        self.mycursor.execute(s_sql, val)
        myresult = self.mycursor.fetchone()
        doc_id = myresult[0]
        print("DOC_ID IS " + str(doc_id))

        # Insert token, doc_id, tf and weight into web_index
        t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"
        t_val = []
        for token in result.keys():
            t_val.append((token, doc_id, result[token][0], result[token][1]))
        # print(t_val)
        self.mycursor.executemany(t_sql, t_val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

        # Insert into the tokenT table
        count = 0
        for token in result.keys():
            tq = "Insert ignore into tokenT values (%s)"
            tv = (token, )
            self.mycursor.execute(tq, tv)
            self.mydb.commit()
            count += 1
        print("inserted " + str(count) + " Tokens")
def searchengine(directory):
    stopWords = set(stopwords.words("english"))
    # stemming
    ps = PorterStemmer()
    # create InvertedIndex obj
    invertedIndex = InvertedIndex()
    # build the corpus
    Corp = Corpus()
    corpus = Corp.buildCorpus(directory)
    for docId in corpus:
        doc = corpus[docId]
        content = doc.getContent()
        # tokenize
        tokens = word_tokenize(content)
        for token in tokens:
            token = token.lower()
            # apply stemming
            token = ps.stem(token)
            # remove stopwords
            if token in stopWords:
                continue
            # add to index
            invertedIndex.addTerm(token, docId)
    return invertedIndex, corpus
def main2():
    corpus = {}
    db = Corpus(database="sanal", collection=sys.argv[1])
    query = {}
    for i, item in enumerate(db.find(query)):
        text = item["text"]
        words = getwords(unicode(text))
        wordsd = {}
        for w in words:
            countup(wordsd, w)
        doc = {"text": wordsd, "id": item["id"]}
        # u = item["user"]["screen_name"]
        u = item["screen_name"]
        try:
            corpus[u].append(doc)
        except KeyError:
            corpus[u] = [doc]
        print(i)
    with file(sys.argv[2], "w") as opened:
        for k, v in corpus.items():
            opened.write("%s\n" % json.dumps({k: v}))
def verificarPlagioTimeProfile(diretorioCorpus, diretorioDocumento, limiar):
    '''
    Input: path to a folder of files, path to a single file, and the containment
    threshold used for the check.
    Measures the time taken to instantiate a Corpus object and a Documento object,
    and to check the document for plagiarism against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
def __init__(self, file_text, doc_name, corpus, n_gram_length):
    Corpus.__init__(self, file_text, doc_name, n_gram_length)
    # Weight each n-gram by log(tf / (doc_length * (corpus_count + 1))), i.e. a
    # log tf-idf-style score with add-one smoothing on the corpus count.
    self.tf_idf = {}
    for n_gram in self.n_gram_count:
        tf = float(self.n_gram_count[n_gram])
        idf = float(self.doc_length * (corpus.n_gram_count[n_gram] + 1))
        self.tf_idf[n_gram] = log(tf / idf)
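# A minimal, self-contained sketch of the weighting used in the constructor above,
# with plain dicts standing in for the Corpus attributes; the names doc_counts,
# corpus_counts and doc_length are illustrative only.
from math import log

def ngram_log_tf_idf(doc_counts, corpus_counts, doc_length):
    weights = {}
    for n_gram, count in doc_counts.items():
        tf = float(count)
        idf = float(doc_length * (corpus_counts.get(n_gram, 0) + 1))
        weights[n_gram] = log(tf / idf)
    return weights

# Example: ngram_log_tf_idf({'the cat': 2}, {'the cat': 5}, doc_length=10)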
def verificarPlagioMemUsageProfile(diretorioCorpus, diretorioDocumento, limiar):
    '''
    Input: path to a folder of files, path to a single file, and the containment
    threshold used for the check.
    Measures the memory used when instantiating a Corpus object and a Documento
    object, and when checking the document for plagiarism against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
def __init__(self, verbose=False):
    print('Loading corpus ...')
    self.corpus = Corpus(verbose=verbose)
    self.corpus.create_data()
    self.X_seqs, self.y_seqs = self.corpus.X_seqs, self.corpus.y_seqs
    self.seq_count = len(self.X_seqs)
    for seq_idx in range(self.seq_count):
        assert len(self.X_seqs[seq_idx]) == len(self.y_seqs[seq_idx])
    self.feature_dim = len(self.X_seqs[0][0])
def train(self, numIterations=100, testCorpusPath=None):
    if testCorpusPath:
        testCorpus = Corpus(testCorpusPath)
    for i in range(1, numIterations + 1):
        self.algorithm.train()  # call train method from algorithm
        if i % 10 == 0:
            # trainEval = Evaluation(self.algorithm.corpus)
            # print "Training evaluation for", i, "iteration(s):\n", trainEval.format()
            # self.algorithm.corpus.resetSentStats()
            if testCorpusPath:
                self.setPredictedTags(testCorpus)
                testEval = Evaluation(testCorpus)
                print "Testing evaluation for", i, "iteration(s):\n", testEval.format()
                # Reset sentence statistics before the next evaluation pass.
                # Note: a prototype pattern (testCorpus = testCorpus.getPrototype(), with
                # self.prototype = self in Corpus.__init__) would avoid looping over the
                # sentences here.
                testCorpus.resetSentStats()
def main():
    corpus = Corpus()
    translator = DirectTranslator()
    modified = ModifiedTranslator()
    testCorpus = corpus.fullCorpus()
    for pair in testCorpus:
        spanishSentence = pair[0]
        realEnglishSentence = pair[1]
        modifiedTranslatedSentence = modified.translateSentence(spanishSentence)
        translatedSentence = translator.translateSentence(spanishSentence)
        print "*** Translation ****"
        print "Spanish: " + spanishSentence
        print "Direct Translation: " + translatedSentence
        print "Modified Translation: " + modifiedTranslatedSentence
        print "Human Translation: " + realEnglishSentence
def setup_corpus(self, theme, nb_docs):
    # holds the filtered words
    self.WORDS = theme + ";"
    # the three centrality metrics (one dictionary per word)
    self.DEGCEN = {}
    self.CLOCEN = {}
    self.BETCEN = {}
    # the corpus theme
    self.THEME = theme
    # number of documents in the corpus
    self.NB_DOCS = nb_docs
    # the corpus itself
    self.corpus = Corpus(theme)
    self.corpus.download_collection(nb_docs, keyword=theme)
    self.A = self.corpus.get_adjacency_matrix()
def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)

    # sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE, sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)

    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle], usePercentage=True)
        documentSamples[documentTitle] = documentSample

    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)

    dataLabels = sorted(list(wordCounter.occurrences.keys()))
    dataSets = []
    for dataLabel in dataLabels:
        # dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)

    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, CONFIDENCE, words)
    statisticsPlotter.plotStatistics(functionType=functionType)
def main(): """Sanity checks the edit model on the word 'hi'.""" trainPath = 'data/tagged-train.dat' trainingCorpus = Corpus(trainPath) editModel = EditModel("data/count_1edit.txt", trainingCorpus) #These are for testing, you can ignore them DELETE_EDITS = set(['Edit(editedWord=i, rule=<h|<)', 'Edit(editedWord=h, rule=hi|h)']) INSERT_EDITS = set([Edit('ahi','<','<a'),Edit('bhi','<','<b'),Edit('chi','<','<c'),Edit('dhi','<','<d'),Edit('ehi','<','<e'),Edit('fhi','<','<f'),Edit('ghi','<','<g'),Edit('hhi','<','<h'),Edit('ihi','<','<i'),Edit('jhi','<','<j'),Edit('khi','<','<k'),Edit('lhi','<','<l'),Edit('mhi','<','<m'),Edit('nhi','<','<n'),Edit('ohi','<','<o'),Edit('phi','<','<p'),Edit('qhi','<','<q'), Edit('rhi','<','<r'),Edit('shi','<','<s'),Edit('thi','<','<t'),Edit('uhi','<','<u'),Edit('vhi','<','<v'),Edit('whi','<','<w'),Edit('xhi','<','<x'),Edit('yhi','<','<y'),Edit('zhi','<','<z'),Edit('hai','h','ha'),Edit('hbi','h','hb'),Edit('hci','h','hc'),Edit('hdi','h','hd'),Edit('hei','h','he'),Edit('hfi','h','hf'),Edit('hgi','h','hg'),Edit('hhi','h','hh'), Edit('hii','h','hi'),Edit('hji','h','hj'),Edit('hki','h','hk'),Edit('hli','h','hl'),Edit('hmi','h','hm'),Edit('hni','h','hn'),Edit('hoi','h','ho'),Edit('hpi','h','hp'),Edit('hqi','h','hq'),Edit('hri','h','hr'),Edit('hsi','h','hs'),Edit('hti','h','ht'),Edit('hui','h','hu'),Edit('hvi','h','hv'),Edit('hwi','h','hw'),Edit('hxi','h','hx'),Edit('hyi','h','hy'),Edit('hzi','h','hz'), Edit('hia','i','ia'),Edit('hib','i','ib'),Edit('hic','i','ic'),Edit('hid','i','id'),Edit('hie','i','ie'),Edit('hif','i','if'),Edit('hig','i','ig'),Edit('hih','i','ih'),Edit('hii','i','ii'),Edit('hij','i','ij'),Edit('hik','i','ik'),Edit('hil','i','il'),Edit('him','i','im'),Edit('hin','i','in'),Edit('hio','i','io'),Edit('hip','i','ip'),Edit('hiq','i','iq'),Edit('hir','i','ir'), Edit('his','i','is'),Edit('hit','i','it'),Edit('hiu','i','iu'),Edit('hiv','i','iv'),Edit('hiw','i','iw'),Edit('hix','i','ix'),Edit('hiy','i','iy'),Edit('hiz','i','iz')]) TRANPOSE_EDITS = set([Edit('ih','hi','ih')]) REPLACE_EDITS = set([Edit('ai','h','a'),Edit('bi','h','b'),Edit('ci','h','c'),Edit('di','h','d'),Edit('ei','h','e'),Edit('fi','h','f'),Edit('gi','h','g'),Edit('ii','h','i'),Edit('ji','h','j'), Edit('ki','h','k'),Edit('li','h','l'),Edit('mi','h','m'),Edit('ni','h','n'),Edit('oi','h','o'),Edit('pi','h','p'),Edit('qi','h','q'),Edit('ri','h','r'),Edit('si','h','s'),Edit('ti','h','t'), Edit('ui','h','u'),Edit('vi','h','v'),Edit('wi','h','w'),Edit('xi','h','x'),Edit('yi','h','y'),Edit('zi','h','z'),Edit('ha','i','a'),Edit('hb','i','b'),Edit('hc','i','c'),Edit('hd','i','d'),Edit('he','i','e'),Edit('hf','i','f'),Edit('hg','i','g'),Edit('hh','i','h'),Edit('hj','i','j'), Edit('hk','i','k'),Edit('hl','i','l'),Edit('hm','i','m'),Edit('hn','i','n'),Edit('ho','i','o'),Edit('hp','i','p'),Edit('hq','i','q'),Edit('hr','i','r'),Edit('hs','i','s'),Edit('ht','i','t'), Edit('hu','i','u'),Edit('hv','i','v'),Edit('hw','i','w'),Edit('hx','i','x'),Edit('hy','i','y'),Edit('hz','i','z')]) print "***Code Sanity Check***" print "Delete edits for 'hi'" checkOverlap(set(editModel.deleteEdits('hi')), DELETE_EDITS) print "Insert edits for 'hi'" checkOverlap(set(editModel.insertEdits('hi')), INSERT_EDITS) print "Transpose edits for 'hi'" checkOverlap(set(editModel.transposeEdits('hi')), TRANPOSE_EDITS) print "Replace edits for 'hi'" checkOverlap(set(editModel.replaceEdits('hi')), REPLACE_EDITS)
def test_reverse_markov_dict():
    full_markov_dict = MarkovDict(source=None, depth=1)
    corpus = Corpus(importStrFromFile("./test/corpora/reverse.txt"))
    full_markov_dict.add(corpus)
    bot = MarkovBot(full_markov_dict)
    print(bot.forward_dict.dict)
    print(bot.reverse_dict.dict)
def train(self, numIterations=100, testCorpusPath=None):
    if testCorpusPath:
        testCorpus = Corpus(testCorpusPath)
    for i in range(1, numIterations + 1):
        self.algorithm.train()  # call train method from algorithm
        if i % 10 == 0:
            # trainEval = Evaluation(self.algorithm.corpus)
            # print "Training evaluation for", i, "iteration(s):\n", trainEval.format()
            # self.algorithm.corpus.resetSentStats()
            if testCorpusPath:
                self.setPredictedTags(testCorpus)
                testEval = Evaluation(testCorpus)
                print "Testing evaluation for", i, "iteration(s):\n", testEval.format()
                # Reset sentence statistics before the next evaluation pass.
                # Note: a prototype pattern (testCorpus = testCorpus.getPrototype(), with
                # self.prototype = self in Corpus.__init__) would avoid looping over the
                # sentences here.
                testCorpus.resetSentStats()
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" # load training data trainPath = 'data/tagged-train.dat' trainingCorpus = Corpus(trainPath) # load dev data devPath = 'data/tagged-dev.dat' devCorpus = Corpus(devPath) print 'Unigram Language Model: ' unigramLM = UnigramModel(trainingCorpus) unigramSpell = SpellCorrect(unigramLM, trainingCorpus) unigramOutcome = unigramSpell.evaluate(devCorpus) print str(unigramOutcome) print 'Uniform Language Model: ' uniformLM = UniformModel(trainingCorpus) uniformSpell = SpellCorrect(uniformLM, trainingCorpus) uniformOutcome = uniformSpell.evaluate(devCorpus) print str(uniformOutcome) print 'Smooth Unigram Language Model: ' smoothUnigramLM = SmoothUnigramModel(trainingCorpus) smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus) smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus) print str(smoothUnigramOutcome) print 'Smooth Bigram Language Model: ' smoothBigramLM = SmoothBigramModel(trainingCorpus) smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus) smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus) print str(smoothBigramOutcome) print 'Backoff Language Model: ' backoffLM = BackoffModel(trainingCorpus) backoffSpell = SpellCorrect(backoffLM, trainingCorpus) backoffOutcome = backoffSpell.evaluate(devCorpus) print str(backoffOutcome) print 'Custom Language Model: ' customLM = CustomModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print str(customOutcome)
def main():
    if len(sys.argv) != 6:
        print "usage: python main.py <init_alpha> <modeldir_name> <num_topic> <data_file> <random/load>"
        sys.exit(1)
    init_alpha = float(sys.argv[1])
    directory = sys.argv[2]
    num_topics = int(sys.argv[3])
    data_file = sys.argv[4]
    start_type = sys.argv[5]

    # read data
    corpus = Corpus()
    corpus.read_data(data_file)

    # run LDA
    LdaEstimator.run_EM(init_alpha, directory, num_topics, corpus, start_type)
def main():
    if len(sys.argv) != 3:
        print "Please provide paths to train and test corpora!"
    else:
        training_corpus = Corpus(sys.argv[1])
        test_corpus = Corpus(sys.argv[2])
        len_pos_train = len(training_corpus.generate_pos_pairs())
        len_neg_train = len(training_corpus.generate_neg_pairs())
        training_corpus.create_mallet_file("training_file_mallet.txt")
        len_test = len(test_corpus.generate_pos_pairs()) + len(test_corpus.generate_neg_pairs())
        test_corpus.create_test_file("test_file_mallet.txt")
        print "There are " + str(len_pos_train) + " positive training instances and " + str(len_neg_train) + " negative training instances."
        print "There are " + str(len_test) + " test instances."
def __init__(self, clustered_corpus):
    self.corpora = []
    for cluster in clustered_corpus:
        corpus = Corpus(cluster)
        self.corpora.append(corpus)
    if len(self.corpora) < 2:
        raise ValueError("clustered_corpus argument is not clustered")
    self.candidate_to_cu_mapping = self.calculate_cus_for()
def train(self, path_to_truth_dir):
    corpus = Corpus(path_to_truth_dir)
    # Read the truth file
    truth = methods.read_classification_from_file(methods.add_slash(path_to_truth_dir) + "!truth.txt")
    # Make truth available to the rest of the filter
    self.truth = truth
    for fname, body in corpus.emails_as_string():
        # Read the email with the email parser
        email_as_file = open(methods.add_slash(path_to_truth_dir) + fname, 'r', encoding='utf-8')
        msg = email.message_from_file(email_as_file)
        self.extract_senders_list(msg, fname)
        self.check_subject(msg, fname)
    # Write the dictionaries to disk
    methods.generate_file_from_dict(self.path_bl, self.black_list)
    methods.generate_file_from_dict(self.path_wl, self.white_list)
    methods.generate_file_from_dict(self.path_ssl, self.spam_subject_list)
    methods.generate_file_from_dict(self.path_hsl, self.ham_subject_list)
def generate_feature_csv(self, feature_csv, pos_lexicon, neg_lexicon, postag_instances=None):
    """
    Generates a csv file with features extracted from instances according to the data-driven DD model
    :param feature_csv:
    :param pos_lexicon:
    :param neg_lexicon:
    :param postag_instances:
    :return:
    """
    if postag_instances:
        corpus_postag_set = Corpus.get_postag_set(postag_instances)  # return all tags in corpus in a list
    else:
        corpus_postag_set = Corpus.get_postag_set(self.instances)  # return all tags in corpus in a list

    # feature file header: ID, text, pos_feature, neg_feature, percentages for all corpus tags, label
    with open(feature_csv, 'wb') as f:
        wr = csv.writer(f)
        id = 1
        wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set + ["label"])
        for inst in self.instances:
            inst_postags = [token.get_tag() for token in inst.get_tokens()]
            inst_postag_counter = Counter(inst_postags)
            postag_percent = []
            for tag in corpus_postag_set:
                if tag in inst_postag_counter:
                    # percentage of words belonging to each POS in instance
                    postag_percent.append(inst_postag_counter[tag] / inst.get_length())
                else:
                    postag_percent.append(0)
            # tokens_list = [token.get_text() for token in inst.get_tokens()]
            tokens_list = [token for token in inst.get_tokens()]  # tokens as objects
            pos_neg_list = self.get_lexicon_features(tokens_list, pos_lexicon, neg_lexicon)
            # wr.writerow([id, inst.get_text(), pos_neg_list[0], pos_neg_list[1]] + postag_percent + [inst.get_label_gold()])
            wr.writerow(
                [unicode(id).encode("utf-8"),
                 unicode(inst.get_text()).encode("utf-8"),
                 unicode(pos_neg_list[0]).encode("utf-8"),
                 unicode(pos_neg_list[1]).encode("utf-8")]
                + postag_percent
                + [unicode(inst.get_label_gold()).encode("utf-8")])
            id += 1
    return feature_csv, corpus_postag_set
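# A compact sketch of the per-POS-tag percentage feature built in the loop above,
# assuming plain lists of tag strings; the tag names in the example are illustrative.
from collections import Counter

def postag_percentages(inst_postags, corpus_postag_set):
    counts = Counter(inst_postags)
    total = float(len(inst_postags))
    return [counts[tag] / total if total else 0.0 for tag in corpus_postag_set]

# Example: postag_percentages(['NN', 'VB', 'NN'], ['NN', 'VB', 'JJ']) -> [0.66..., 0.33..., 0.0]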
def test():
    c1 = Corpus("Big round boulder. That is a round snake.")
    c2 = Corpus("The dog is fat. The dog eats food. My dog is yellow. Your cat is yellow.")
    c3 = Corpus("Look out! Look behind you. Are you there? Are you okay? To you, I defer.")
    m1 = MarkovDict(c1)
    m2 = MarkovDict(c2, 2)
    m3 = MarkovDict(c3)
    print("m1:", m1.response())
    print("m1:", m1.response())
    print("m1:", m1.response())
    print("m2:", m2.response())
    print("m2:", m2.response())
    print("m2:", m2.response())
    print("m3:", m3.response())
    print("m3:", m3.response())
    print("m3:", m3.response())
def downloadCorpus(snapshotDir, corpusDir, projectName, configInfo):
    # 2. Dump the snapshots for a project
    msg = '---------------------------------------------------- \n'
    msg += ' Dump the corpus for project %s \n' % projectName
    msg += '---------------------------------------------------- \n'
    print(msg)

    project_snapshot_dir = os.path.join(snapshotDir, projectName)
    project_corpus_dir = os.path.join(corpusDir, projectName)

    if os.path.isdir(project_corpus_dir):
        print "!! %s already exists...returning \n" % project_corpus_dir
        # return

    corpus = Corpus(project_snapshot_dir, 'java', project_corpus_dir, configInfo)
    # logging.debug(corpus)
    # print corpus
    corpus.dump()
def main():
    clustered_corpus_path = 'clustered_corpus'
    clustered_corpus = read_clustered_corpus(clustered_corpus_path)
    corpus = merge_clustered_corpus_into_a_single_corpus(clustered_corpus)

    target_file_path = 'target.txt'
    text = read_text_file(target_file_path)

    document = Document(text)
    corpus = Corpus(corpus)
    clustered_corpus = ClusteredCorpus(clustered_corpus)

    candidate_to_rank_mapping = {}
    candidate_to_params_mapping = {}
    candidate_to_dfs_in_each_cluster_mapping = {}
    for candidate in document.get_candidates():
        tf = math.log(1.0 + document.get_tf_for(candidate), 10.0)
        # tf = document.get_tf_for(candidate)
        idf = math.log(1.0 + 1.0 / corpus.get_df_for(candidate), 2.0)
        cu = clustered_corpus.get_cu_for(candidate)
        rank = cu
        # rank = tf * cu
        # rank = tf * idf
        dfs_in_each_cluster = clustered_corpus.get_dfs_in_each_cluster_for(candidate)
        candidate_representative = corpus.get_representative_for(candidate)
        candidate_to_rank_mapping[candidate_representative] = rank
        candidate_to_params_mapping[candidate_representative] = (tf, idf, cu)
        candidate_to_dfs_in_each_cluster_mapping[candidate_representative] = dfs_in_each_cluster

    table = generate_table_based_on(
        candidate_to_rank_mapping,
        candidate_to_params_mapping,
        candidate_to_dfs_in_each_cluster_mapping
    )
    save_as_file(table)
    print('Done.')
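# A small sketch of the ranking terms computed in the loop above, using the same log
# bases; tf_raw, df and cu stand in for the Document/Corpus/ClusteredCorpus accessors
# shown there, so this is illustrative rather than a drop-in helper.
import math

def rank_components(tf_raw, df, cu):
    tf = math.log(1.0 + tf_raw, 10.0)    # damped term frequency (base-10 log)
    idf = math.log(1.0 + 1.0 / df, 2.0)  # inverse document frequency (base-2 log)
    return tf, idf, cu                   # the script above currently ranks by cu alone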
def __init__(self, text, keywords=None, remove_stopword=True, with_segs=False):
    self.text = text
    self.corpus = Corpus(text, keywords=keywords, remove_stopword=remove_stopword, with_segs=with_segs)
    self.network = nx.Graph()
    self.build_network()
def generate_combined_features(self, feature_csv):
    feature_rows = pd.read_csv(feature_csv)
    # Create vectorizer for function to use
    vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
    y = feature_rows["label"].values.astype(np.float32)
    X = sp.sparse.hstack(
        (vectorizer.fit_transform(feature_rows.text),
         feature_rows[['pos', 'neg'] + Corpus.get_postag_set(self.instances)].values),
        format='csr')
    return X, y, vectorizer
def main():
    def get_data():
        client = pymongo.MongoClient()
        db = client.twitter4
        cursor = db.stream.aggregate([
            {'$match': {'date': {'$gt': datetime.datetime(2015, 11, 13)}}},
            {'$sort': {'date': 1}},
            {'$project': {'text': 1, 'date': 1}},
        ])
        return cursor

    def get_remote_data():
        client = pymongo.MongoClient(host='59.77.134.176')
        db = client.twitter3
        cursor = db.stream.aggregate([
            # {'$sort': {'date': 1}},
            {'$project': {'text': 1}},
        ])
        return cursor

    cursor = get_data()
    print 'calculate_entropy: count each word only once'
    olda = None
    reallen = 0
    # for chunk_no, doc_chunk in enumerate(cursor_serial(cursor, 3000)):
    for chunk_no, doc_chunk in enumerate(chunkize_serial(cursor, 3000, as_numpy=False)):
        print doc_chunk[0]['date']
        doc_chunk = [tweet['text'] for tweet in doc_chunk]
        reallen += len(doc_chunk)
        print chunk_no, reallen - len(doc_chunk), reallen, len(doc_chunk), 'lda'

        start = datetime.datetime.now()
        if not olda:
            corpus = Corpus(doc_chunk)
            olda = OnlineLDA(corpus, K=10)
        else:
            olda.fit(doc_chunk)  # Give them to online LDA
        print datetime.datetime.now() - start

        with codecs.open(r'G:\test18.out', "w", "utf-8-sig") as f:
            for topic_id, (topic_likelihood, topic_words, topic_tweets) in olda.get_lda_info():
                print '{}%\t{}'.format(round(topic_likelihood * 100, 2), topic_words)
                print '\t', topic_tweets
                f.write(topic_tweets + '\n')
        print '\n\n\n\n\n\n'
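# A rough, standard-library-only sketch of the chunked iteration relied on above;
# chunkize_serial comes from gensim-style utilities, so this stand-in (chunks_of)
# is illustrative and not the original implementation.
from itertools import islice

def chunks_of(iterable, chunk_size):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            break
        yield chunk

# Example: for chunk_no, doc_chunk in enumerate(chunks_of(cursor, 3000)): ...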
def createWordLookup(self, foreignSentence):
    corpus = Corpus()
    tokenDictList = []
    # Captures only words, no spaces/punctuation
    spanishTokens = re.compile('(\W+)', re.UNICODE).split(unicode(foreignSentence, 'utf-8'))
    spanishTokens.pop()
    for idx, token in enumerate(spanishTokens):
        tokenDict = dict()
        tokenDict['originalToken'] = token
        tokenDict['spanish_POS'] = corpus.spanishTags().get(token, None)
        if len(token) > 0:
            if token[0].isupper():
                tokenDict['upper'] = True
            else:
                tokenDict['upper'] = False
        else:
            tokenDict['upper'] = False
        tokenDictList.append(tokenDict)
    self.tokenDictList = tokenDictList
def __init__(self, text, keywords=None, remove_stopword=True, with_segs=False, weight_type='count'):
    self.text = text
    self.corpus = Corpus(text, keywords=keywords, remove_stopword=remove_stopword, with_segs=with_segs)
    self.network = nx.Graph()
    self._network(weight_type)
def generate_feature_csv(self, feature_csv, pos_lexicon, neg_lexicon, postag_instances=None):
    if postag_instances:
        corpus_postag_set = Corpus.get_postag_set(postag_instances)  # return all tags in corpus in a list
    else:
        corpus_postag_set = Corpus.get_postag_set(self.instances)  # return all tags in corpus in a list

    # ID, text, pos_feature, neg_feature, percentages for all corpus tags, label
    with open(feature_csv, 'wb') as f:
        # wr = csv.writer(f, quoting=csv.QUOTE_ALL)
        wr = csv.writer(f)
        id = 1
        wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set + ["label"])
        for inst in self.instances:
            inst_postags = [token.get_tag() for token in inst.get_tokens()]
            inst_postag_counter = Counter(inst_postags)
            postag_percent = []
            for tag in corpus_postag_set:
                if tag in inst_postag_counter:
                    # percentage of words belonging to each POS in instance
                    postag_percent.append(inst_postag_counter[tag] / inst.get_length())
                else:
                    postag_percent.append(0)
            pos_neg_list = self.get_lexicon_features(inst.get_text(), pos_lexicon, neg_lexicon)
            wr.writerow([id, inst.get_text(), pos_neg_list[0], pos_neg_list[1]]
                        + postag_percent + [inst.get_label_gold()])
            id += 1
    return feature_csv, corpus_postag_set
def create_corpus():
    corpus = Corpus()
    for folder in glob.iglob('texts/*'):
        for filename in glob.iglob(folder + "/*"):
            corpus.add_document(Document(filename))
        # corpus.add_document(Document(folder))
    corpus.build_vocabulary()
    return corpus
def test(self, path_to_test_dir):
    predictions = {}  # Predictions dict {fname: prediction}
    bs = Bayesian.Bayesian()
    corpus = Corpus(path_to_test_dir)
    # Read the dictionaries (in case test is called before train)
    black_list_dict = methods.read_dict_from_file(self.path_bl)
    white_list_dict = methods.read_dict_from_file(self.path_wl)
    spam_subject_dict = methods.read_dict_from_file(self.path_ssl)
    ham_subject_dict = methods.read_dict_from_file(self.path_hsl)
    for fname, body in corpus.emails_as_string():
        # Open the email with the parser
        email_as_file = open(methods.add_slash(path_to_test_dir) + fname, 'r', encoding='utf-8')
        msg = email.message_from_file(email_as_file)
        # Check if the sender is in the black list
        if self.extract_email_adress_from_text(msg['From']) in black_list_dict:
            predictions[fname] = 'SPAM'
        # Check if the sender is in the white list
        elif self.extract_email_adress_from_text(msg['From']) in white_list_dict:
            predictions[fname] = 'OK'
        # Check against the spam subject list
        elif self.extract_email_adress_from_text(msg['From']) in spam_subject_dict:
            predictions[fname] = 'SPAM'
        # Check against the ham subject list
        elif self.extract_email_adress_from_text(msg['From']) in ham_subject_dict:
            predictions[fname] = 'OK'
        # Otherwise fall back to the Bayesian checker
        else:
            if bs.bayesian_prediction(methods.get_text(msg)) > 0.485:
                predictions[fname] = 'SPAM'
            else:
                predictions[fname] = 'OK'
    # Generate the prediction file
    bf = BaseFilter(path_to_test_dir, predictions)
    bf.generate_prediction_file()
def get_description(self):
    '''
    Fetches every URL, extracts its description text and updates it in the database.
    '''
    # get doc_id and url pairs
    self.mycursor.execute("select id,url from doc")
    myresult = self.mycursor.fetchall()
    for doc_id, url in myresult:
        # print("**********Doc ID is " + str(doc_id) + " ********")
        c = Corpus()
        name = c.url_to_dir(url)
        # print("Name is " + name)
        with open(name, "rb") as file:
            content = file.read()
        soup = BeautifulSoup(content, "lxml")
        metas = soup.find_all("meta")
        result = ''
        for meta in metas:
            if ('content' in meta.attrs) and ('name' in meta.attrs) and \
                    ((meta.attrs['name'] == 'description') or (meta.attrs['name'] == 'keywords')):
                result = " ".join(meta.attrs['content'].split())
        # if the html doesn't have a description tag
        if result == '':
            script = soup.find(["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"])
            if script:
                temp = " ".join(script.text.split())
                result += temp if len(temp) < 200 else ""
        print(result)
        i_sql = "update doc set description =%s where id = %s"
        i_val = (result, doc_id)
        self.mycursor.execute(i_sql, i_val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in DOC , DOC ID IS " + str(doc_id))
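# A minimal sketch of the meta-description lookup used in get_description above,
# assuming BeautifulSoup with the lxml parser is available; the HTML snippet in the
# usage example is made up.
from bs4 import BeautifulSoup

def extract_description(html):
    soup = BeautifulSoup(html, "lxml")
    for meta in soup.find_all("meta"):
        if meta.attrs.get("name") in ("description", "keywords") and "content" in meta.attrs:
            return " ".join(meta.attrs["content"].split())
    return ""

# Example: extract_description('<meta name="description" content="A small test page">')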
def __init__(self):
    self.emotion = {
        'surprise': 0,
        'anger': 1,
        'happy': 2,
        'love': 3,
        'fear': 4,
        'trust': 5,
        'disgust': 6,
        'sad': 7
    }
    self.tp = 0
    self.tn = 0
    self.fp = 0
    self.fn = 0
    self.accuracy = 0
    self.precision = 0
    self.recall = 0
    self.F1 = 0
    # self.list_gold = list_gold
    # self.list_prediction = list_prediction
    cp = Corpus()
    self.list_gold = cp.read_label()
    self.list_prediction = cp.read_prediction()
def test(): print("Creating new markov dict...") print(getDepth()) corpus_path = "./corpora/test/depth.txt" corpus = Corpus(importStrFromFile(corpus_path)) print(corpus) reverse_corpus = list(reversed(corpus)) print(reverse_corpus) forward_markov_dict = MarkovDict(source=corpus, depth=getDepth()) reverse_markov_dict = MarkovDict(source=reverse_corpus, depth=getDepth()) pprint(forward_markov_dict.dict) pprint(reverse_markov_dict.dict) bot = MarkovBot(forward_markov_dict, reverse_markov_dict) # pprint(bot.forward_dict.dict) # pprint(bot.reverse_dict.dict) print(bot.response(topic="markov"))
def main(): print("Creating new markov dict...") forward_markov_dict = MarkovDict(source=None, depth=getDepth()) reverse_markov_dict = MarkovDict(source=None, depth=getDepth()) print("Starting for loop to add corpora...") for corpus_path in corporaPaths(): corpus = Corpus(importStrFromFile(corpus_path)) print("Adding corpus with path '" + corpus_path + "'...") forward_markov_dict.add(corpus) reverse_markov_dict.add(list(reversed(corpus))) print("Initializing MarkovBot...") bot = MarkovBot(forward_markov_dict, reverse_markov_dict) print("\nWelcome to MarkovBot! Type a message. Type 'exit()' to quit.") message = prompt() while message != "exit()": print(bot.response(topic=message.split()[0])) message = prompt()
def __init__(self, data_path, corpus_file):
    """
    WorbEmb class init

    Parameters
    ----------
    data_path : str
        data full path
    corpus_file : str
        protein domain corpus file name

    Returns
    -------
    None
    """
    self.data_path = data_path
    self.corpus_file = corpus_file
    self.Corpus = Corpus(self.data_path, self.corpus_file)
    self.w2v_model = "none"
    self.w2v_file_out = ""
class Searcher:

    def __init__(self, raw_documents):
        self.corpus = Corpus(raw_documents)
        self.metrics = Metrics(self.corpus)

    def search(self, query):
        results = []
        query_document = Document(query)
        query_stems = query_document.get_stems()
        documents = self.corpus.get_documents()
        for doc in documents:
            document_id = doc.get_id()
            score = 0.0
            stemmed_document = doc.get_stems()
            for qstem in query_stems:
                if qstem in stemmed_document:
                    term_frequency = self.metrics.get_term_frequency(document_id, qstem)
                    score += term_frequency
            if score > 0.0:
                results.append({"id": doc.get_id(), "score": score, "text": doc.get_text()})
        return results

    def get_corpus(self):
        return self.corpus
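# A hedged usage sketch for the Searcher class above, assuming that Corpus, Document
# and Metrics behave the way the accessors used in search() suggest; the raw documents
# and the query string below are made up for illustration.
if __name__ == "__main__":
    searcher = Searcher(["the cat sat on the mat", "dogs chase cats", "a quiet evening"])
    for hit in searcher.search("cats"):
        print(hit["id"], hit["score"], hit["text"])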
def __init__(self, raw_documents):
    self.corpus = Corpus(raw_documents)
    self.metrics = Metrics(self.corpus)
else:
    termino = load_terminology(config.termino_path)
    print "\nTerminology loaded"

print "\nLoading %i files" % nb_files
print "-------------------------------------------"

docs = []
# Load the files
for i, f in enumerate(files):
    sys.stdout.write("\r%3i/%i %s" % (i + 1, nb_files, '{:<70}'.format(f)))
    sys.stdout.flush()
    docs.append(Document(f))
corpus = Corpus(docs, termino)

print "\n\nCorpus preprocessing"
print "-------------------------------------------"
corpus.preprocess()

print "\n\nExtracting the keywords"
print "-------------------------------------------"
corpus.process()

if Config().testing:
    print "\n\nResults (%s average)" % ("Macro" if config.macro_average else "Micro")
    print "-------------------------------------------"
    corpus.results()
else:
    print "\n"
# -*- coding: utf-8 -*-
import sys
import extractor
from Corpus import Corpus

if __name__ == "__main__":
    dbname, collname = sys.argv[1], sys.argv[2]
    corpus_db = Corpus(database=dbname, collection=collname)
    df_dbname, df_collname = dbname, sys.argv[3]
    df = {}
    for j, item in enumerate(corpus_db.find({})):
        for word in set(extractor.getwords(item["text"])):
            extractor.countup(df, word)
    with file(df_collname, "w") as opened:
        for word, freq in sorted(df.items(), key=lambda x: x[1], reverse=True):
            opened.write("%s\t%d\n" % (word, freq))
def parse_args():
    parser = argparse.ArgumentParser(description="says")
    parser.add_argument("-d", "--database", default="says")
    parser.add_argument("-i", "--items", default="items")
    parser.add_argument("-s", "--itemstats", default="itemstats")
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = parse_args()
    db = Corpus(database=args.database, collection=args.items)
    db_stats = Corpus(database=args.database, collection=args.itemstats)
    try:
        latstats = db_stats.findsorted({}, key="id")[0]["id"]
    except IndexError:
        latstats = 0L
    for i, item in enumerate(db.find({"id": {"$gt": latstats}})):
        words = extractd.getwords(item)
        messages = extractd.getmessages(item)
        tags = extractd.gethashtags(item)
        urls = extractd.geturls(item)
        db_stats.append({
        uid = extractd.getid(n2i, u)
        vid = extractd.getid(n2i, v)
        graph.add_edge(uid, vid)
        extractd.countup(weights, (uid, vid))
        extractd.countup(weights, (vid, uid))

    with file('%s.wpairs' % sys.argv[1], 'w') as opened:
        for e in graph.edges():
            w = weights[(e[0], e[1])] if weights[(e[0], e[1])] <= weights[(e[1], e[0])] else weights[(e[1], e[0])]
            opened.write('%d\t%d\t%d\n' % (e[0], e[1], w))

    with file('%s.n2i' % sys.argv[1], 'w') as opened:
        for u in n2i:
            opened.write('%s\t%d\n' % (u, n2i[u]))

if __name__ == '__main__':
    dbinfo = Pit.get("says")
    db = Corpus(database=dbinfo["db"], collection=dbinfo["items"])
    t_end = time.mktime(datetime.today().timetuple())
    t_begin = t_end - (24 * 60 * 60 * 10)
    items = [item for item in db.find({'created_at': {'$gt': t_begin, '$lt': t_end}})]
    make_graph(items)
def parse_args():
    usage = "[--interval] [interval] [-l] [path-to-log]"
    parser = argparse.ArgumentParser(description="says")
    parser.add_argument("--interval", type=float, default=1.0)
    parser.add_argument("-l", "--log", default=".log/log")
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = parse_args()
    dbinfo = Pit.get("says")
    users_db = Corpus(database=dbinfo["db"], collection=dbinfo["users"])
    # users = users_db.find({})
    users = [item["screen_name"] for item in users_db.find({})]
    api = activate_api()
    items_db = Corpus(database=dbinfo["db"], collection=dbinfo["items"])
    getitems(users, api, items_db)
def shell(filelimit=0):
    # rootpath = "/home/dicle/Dicle/Tez/dataset/readingtest30/"
    corpuspath = "/home/dicle/Dicle/Tez/dataset/readingtest300/"
    rootpath = corpuspath
    folders = IOtools.getfoldernames_of_dir(corpuspath)
    foldername = ""
    corpus = Corpus(rootpath)

    singlefolder = False
    if len(folders) == 0:
        singlefolder = True

    if singlefolder:
        rootpath = corpuspath
        # corpus = Corpus(rootpath, foldername)
        starttime = datetime.now()
        buildcorpus(corpus, rootpath, filelimit)
        endtime_buildcorpus = datetime.now()
        print "build corpus took: ", str(endtime_buildcorpus - starttime)
        print "corpus length ", str(len(corpus.words)), " words"
    else:
        for foldername in folders:
            print "Folder: ", foldername
            rootpath = corpuspath + os.sep + foldername + os.sep
            # corpus = Corpus(rootpath, foldername)
            starttime = datetime.now()
            buildcorpus(corpus, rootpath)
            endtime_buildcorpus = datetime.now()
            print "build corpus took: ", str(endtime_buildcorpus - starttime)
            print "corpus length ", str(len(corpus.words)), " words"

    print "pickle-getting words"
    corpus.picklegetwords()

    print "assigning pos tags"
    assignPOStags(corpus)
    endtime_postags = datetime.now()
    print "postag assignment took: ", str(endtime_postags - endtime_buildcorpus)

    '''
    get_magnitudewords_doc_matrix(corpus)
    adjectives = get_words_ofPOStag(corpus, "ADJ")
    print "numof adjectives, ", len(adjectives), " ", adjectives[:-10]
    get_docterm_matrix(corpus, adjectives, "adjective-doc-matrix.txt", record=True)
    '''

    endtime = datetime.now()
    passtime = endtime - starttime
    print "Elapsed time: ", passtime, " on folder ", foldername

    print "pickle-dumping words"
    endtimep = datetime.now()
    corpus.pickledumpwords()
    print "Corpus length: ", len(corpus.words)
    print "Elapsed time for pickle: ", str(endtimep - endtime)

    # PICKLE words
    print "pickle-getting words"
    corpus.picklegetwords()
    print "corpus first 20 words:"
    for word in corpus.words[:20]:
        word.toscreen()

    print "pickle-dumping words"
    corpus.pickledumpwords()
        self.num_word = corpus.getWordNum()
        self.pz_w = self._rand_mat(self.num_topic, self.num_word)
        print self.pz_w

    def fit(self):
        """
        Train the pLSA model on a corpus of the form
        [
            [token1, token2],
            [token1, token2, token3, token4]
        ]
        :return:
        """
        pass

    def transform(self):
        # not used
        pass

    def _rand_mat(self, sizex, sizey):
        # Random matrix with each row normalised to sum to 1
        mat = np.random.rand(sizex, sizey)
        for r in range(sizex):
            s = np.sum(mat[r])
            for c in range(sizey):
                mat[r][c] = mat[r][c] / s
        return mat

if __name__ == "__main__":
    corpus = Corpus("../data/topic/corpus")
    print corpus.getVocab()
    plsa = pLSA(corpus, 5)
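# A vectorised sketch of the row normalisation done by _rand_mat above, assuming numpy;
# each row is scaled to sum to 1 so it can serve as a conditional distribution P(word | topic).
import numpy as np

def rand_stochastic_matrix(num_rows, num_cols):
    mat = np.random.rand(num_rows, num_cols)
    return mat / mat.sum(axis=1, keepdims=True)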
import time
start_time = time.time()

# Path to the files that make up the corpus
path = 'docs/'

# List every document in the given folder
corpus = os.listdir(path)

# Use full paths instead of bare document names
for i in range(0, len(corpus)):
    corpus[i] = path + corpus[i]

# Build the corpus
corpus = Corpus(corpus, 'stopwords.txt', 'dico/')

cheminRequete = "requete"
requeteFile = open(cheminRequete, 'w', -1, 'utf-8')
for i in range(1, len(sys.argv)):
    requeteFile.write(sys.argv[i] + " ")
requeteFile.close()

corpus.lemmatiserCorpus(cheminRequete)
corpus.vectoriserDocCorpus()

# Prepare a variable for the final similarity computation
finale = {}
finale = corpus.calculSimilarite()

for i in range(0, len(finale)):
from Corpus import Corpus
from Rule import PossibleRules

print("Analysis starting...")

train_corpus = Corpus(["./dataset/TrainingSet/file1.txt",
                       "./dataset/TrainingSet/file2.txt",
                       "./dataset/TrainingSet/file3.txt",
                       "./dataset/TrainingSet/file4.txt",
                       "./dataset/TrainingSet/file5.txt",
                       "./dataset/TrainingSet/file6.txt",
                       "./dataset/TrainingSet/file7.txt",
                       "./dataset/TrainingSet/file8.txt"])

train_corpus.outputWords("./Output/MostLikelyMorphParseForWord.txt")
print("Most likely morphological parses for words are written to ./Output/MostLikelyMorphParseForWord.txt")

train_corpus.outputPOStags("./Output/MostLikelyTag.txt")
print("Most likely tags are written to ./Output/MostLikelyTag.txt")

train_corpus.tag_words_with_most_likely_parses()

tag_order = 1
print("TRAIN: Precision for DS" + str(tag_order) + " " + str(train_corpus.calculate_precision()))

print("Possible rules are generating...")
# just try first 20 words in the training corpus since it is expensive to walk through all the words
rules = PossibleRules(train_corpus.tags[:20]).rules
print(str(len(train_corpus.all_words_in_corpus)) + " words in training set.")

learned_rules_with_precision = []
for rule in rules: