def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" trainPath = 'data/tagged-train.dat' trainingCorpus = Corpus(trainPath) devPath = 'data/tagged-dev.dat' devCorpus = Corpus(devPath) # print 'Unigram Language Model: ' # unigramLM = UnigramModel(trainingCorpus) # unigramSpell = SpellCorrect(unigramLM, trainingCorpus) # unigramOutcome = unigramSpell.evaluate(devCorpus) # print str(unigramOutcome) # print 'Uniform Language Model: ' # uniformLM = UniformModel(trainingCorpus) # uniformSpell = SpellCorrect(uniformLM, trainingCorpus) # uniformOutcome = uniformSpell.evaluate(devCorpus) # print str(uniformOutcome) # print 'Smooth Unigram Language Model: ' # smoothUnigramLM = SmoothUnigramModel(trainingCorpus) # smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus) # smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus) # print str(smoothUnigramOutcome) print 'Smooth Bigram Language Model: ' smoothBigramLM = SmoothBigramModel(trainingCorpus) smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus) smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus) print str(smoothBigramOutcome)
def main(rootDirectory, words, confidenceLevel, functionType):
    corpus = Corpus(rootDirectory, toLowercase=TO_LOWERCASE, filters=FILTERS)
    # sampler = Sampler(SAMPLE_SIZE, sampleLength=SAMPLE_LENGTH)
    sampler = Sampler(SAMPLE_SIZE, sampleLengthPercentage=SAMPLE_LENGTH_PERCENTAGE)
    documentSamples = {}
    for documentTitle in corpus.documents:
        documentSample = sampler.sample(corpus.documents[documentTitle], usePercentage=True)
        documentSamples[documentTitle] = documentSample
    wordCounter = WordCounter(words, SAMPLE_SIZE)
    wordCounter.countOccurrences(documentSamples)
    dataLabels = sorted(list(wordCounter.occurrences.keys()))
    dataSets = []
    for dataLabel in dataLabels:
        # dataSet = wordCounter.occurrences[dataLabel]
        dataSet = wordCounter.occurrencesPerMillionWords[dataLabel]
        dataSets.append(dataSet)
    statisticsPlotter = StatisticsPlotter(dataLabels, dataSets, CONFIDENCE, words)
    statisticsPlotter.plotStatistics(functionType=functionType)
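# Hedged sketch of the module-level constants main() above relies on; the
# values below are illustrative placeholders, not the project's real
# configuration. Note that the confidenceLevel parameter is currently unused:
# the plotter receives the CONFIDENCE constant instead.
# TO_LOWERCASE = True
# FILTERS = []
# SAMPLE_SIZE = 100
# SAMPLE_LENGTH_PERCENTAGE = 0.1
# CONFIDENCE = 0.95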
def searchengine(directory):
    stopWords = set(stopwords.words("english"))
    # stemming
    ps = PorterStemmer()
    # create InvertedIndex obj
    invertedIndex = InvertedIndex()
    # build the corpus
    Corp = Corpus()
    corpus = Corp.buildCorpus(directory)
    for docId in corpus:
        doc = corpus[docId]
        content = doc.getContent()
        # tokenize
        tokens = word_tokenize(content)
        for token in tokens:
            token = token.lower()
            # apply stemming
            token = ps.stem(token)
            # remove stopwords (note: the check runs on the stemmed token,
            # so stopwords whose stem differs from the surface form slip through)
            if token in stopWords:
                continue
            # add to index
            invertedIndex.addTerm(token, docId)
    return invertedIndex, corpus
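# Hedged usage sketch for searchengine() above: the query term is normalized
# the same way the index was built (lowercased, then Porter-stemmed). The
# postings accessor name is hypothetical; these snippets only show
# InvertedIndex.addTerm().
def query_index(directory, query):
    invertedIndex, corpus = searchengine(directory)
    term = PorterStemmer().stem(query.lower())
    return invertedIndex.getPostings(term)  # hypothetical accessor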
def test_reverse_markov_dict():
    full_markov_dict = MarkovDict(source=None, depth=1)
    corpus = Corpus(importStrFromFile("./test/corpora/reverse.txt"))
    full_markov_dict.add(corpus)
    bot = MarkovBot(full_markov_dict)
    print(bot.forward_dict.dict)
    print(bot.reverse_dict.dict)
def build_index(self):
    '''
    This function builds the inverted index: it inserts each URL into the doc
    table with a doc_id, inserts each token into the tokenT table, and inserts
    token, doc_id, term frequency and weight into the web_index table.
    '''
    c = Corpus()
    t = Tokenizer()
    for url, name in c.get_file_name():
        if len(url) > 1000:
            continue
        result = t.tokenize(name)
        if len(result) == 0:
            continue
        print(url)

        # Insert URL into the doc table
        sql = "INSERT INTO web.doc(url) values (%s)"
        val = (url, )
        self.mycursor.execute(sql, val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in URL.")
        print(url)

        # Look up the auto-generated doc_id for this URL
        s_sql = "select id from doc where url=%s"
        self.mycursor.execute(s_sql, val)
        myresult = self.mycursor.fetchone()
        doc_id = myresult[0]
        print("DOC_ID IS " + str(doc_id))

        # Insert token, doc_id, tf and weight into web_index
        t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"
        t_val = []
        for token in result.keys():
            t_val.append((token, doc_id, result[token][0], result[token][1]))
        # print(t_val)
        self.mycursor.executemany(t_sql, t_val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

        # Insert into the tokenT table
        count = 0
        for token in result.keys():
            tq = "Insert ignore into tokenT values (%s)"
            tv = (token, )
            self.mycursor.execute(tq, tv)
            self.mydb.commit()
            count += 1
        print("inserted " + str(count) + " Tokens")
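# Hedged sketch of the MySQL schema build_index() above appears to assume,
# inferred from its INSERT statements (and from the description update in
# get_description below). Column types are guesses, not the project's DDL.
SCHEMA_SKETCH = """
CREATE TABLE doc (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    url         VARCHAR(1000),
    description TEXT
);
CREATE TABLE web_index (
    token  VARCHAR(255),
    doc_id INT,
    tf     INT,    -- term frequency
    wt     FLOAT   -- weight
);
CREATE TABLE tokenT (
    token VARCHAR(255) PRIMARY KEY  -- a unique key makes INSERT IGNORE meaningful
);
"""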
def main(): """Sanity checks the edit model on the word 'hi'.""" trainPath = 'data/tagged-train.dat' trainingCorpus = Corpus(trainPath) editModel = EditModel("data/count_1edit.txt", trainingCorpus) #These are for testing, you can ignore them DELETE_EDITS = set(['Edit(editedWord=i, rule=<h|<)', 'Edit(editedWord=h, rule=hi|h)']) INSERT_EDITS = set([Edit('ahi','<','<a'),Edit('bhi','<','<b'),Edit('chi','<','<c'),Edit('dhi','<','<d'),Edit('ehi','<','<e'),Edit('fhi','<','<f'),Edit('ghi','<','<g'),Edit('hhi','<','<h'),Edit('ihi','<','<i'),Edit('jhi','<','<j'),Edit('khi','<','<k'),Edit('lhi','<','<l'),Edit('mhi','<','<m'),Edit('nhi','<','<n'),Edit('ohi','<','<o'),Edit('phi','<','<p'),Edit('qhi','<','<q'), Edit('rhi','<','<r'),Edit('shi','<','<s'),Edit('thi','<','<t'),Edit('uhi','<','<u'),Edit('vhi','<','<v'),Edit('whi','<','<w'),Edit('xhi','<','<x'),Edit('yhi','<','<y'),Edit('zhi','<','<z'),Edit('hai','h','ha'),Edit('hbi','h','hb'),Edit('hci','h','hc'),Edit('hdi','h','hd'),Edit('hei','h','he'),Edit('hfi','h','hf'),Edit('hgi','h','hg'),Edit('hhi','h','hh'), Edit('hii','h','hi'),Edit('hji','h','hj'),Edit('hki','h','hk'),Edit('hli','h','hl'),Edit('hmi','h','hm'),Edit('hni','h','hn'),Edit('hoi','h','ho'),Edit('hpi','h','hp'),Edit('hqi','h','hq'),Edit('hri','h','hr'),Edit('hsi','h','hs'),Edit('hti','h','ht'),Edit('hui','h','hu'),Edit('hvi','h','hv'),Edit('hwi','h','hw'),Edit('hxi','h','hx'),Edit('hyi','h','hy'),Edit('hzi','h','hz'), Edit('hia','i','ia'),Edit('hib','i','ib'),Edit('hic','i','ic'),Edit('hid','i','id'),Edit('hie','i','ie'),Edit('hif','i','if'),Edit('hig','i','ig'),Edit('hih','i','ih'),Edit('hii','i','ii'),Edit('hij','i','ij'),Edit('hik','i','ik'),Edit('hil','i','il'),Edit('him','i','im'),Edit('hin','i','in'),Edit('hio','i','io'),Edit('hip','i','ip'),Edit('hiq','i','iq'),Edit('hir','i','ir'), Edit('his','i','is'),Edit('hit','i','it'),Edit('hiu','i','iu'),Edit('hiv','i','iv'),Edit('hiw','i','iw'),Edit('hix','i','ix'),Edit('hiy','i','iy'),Edit('hiz','i','iz')]) TRANPOSE_EDITS = set([Edit('ih','hi','ih')]) REPLACE_EDITS = set([Edit('ai','h','a'),Edit('bi','h','b'),Edit('ci','h','c'),Edit('di','h','d'),Edit('ei','h','e'),Edit('fi','h','f'),Edit('gi','h','g'),Edit('ii','h','i'),Edit('ji','h','j'), Edit('ki','h','k'),Edit('li','h','l'),Edit('mi','h','m'),Edit('ni','h','n'),Edit('oi','h','o'),Edit('pi','h','p'),Edit('qi','h','q'),Edit('ri','h','r'),Edit('si','h','s'),Edit('ti','h','t'), Edit('ui','h','u'),Edit('vi','h','v'),Edit('wi','h','w'),Edit('xi','h','x'),Edit('yi','h','y'),Edit('zi','h','z'),Edit('ha','i','a'),Edit('hb','i','b'),Edit('hc','i','c'),Edit('hd','i','d'),Edit('he','i','e'),Edit('hf','i','f'),Edit('hg','i','g'),Edit('hh','i','h'),Edit('hj','i','j'), Edit('hk','i','k'),Edit('hl','i','l'),Edit('hm','i','m'),Edit('hn','i','n'),Edit('ho','i','o'),Edit('hp','i','p'),Edit('hq','i','q'),Edit('hr','i','r'),Edit('hs','i','s'),Edit('ht','i','t'), Edit('hu','i','u'),Edit('hv','i','v'),Edit('hw','i','w'),Edit('hx','i','x'),Edit('hy','i','y'),Edit('hz','i','z')]) print "***Code Sanity Check***" print "Delete edits for 'hi'" checkOverlap(set(editModel.deleteEdits('hi')), DELETE_EDITS) print "Insert edits for 'hi'" checkOverlap(set(editModel.insertEdits('hi')), INSERT_EDITS) print "Transpose edits for 'hi'" checkOverlap(set(editModel.transposeEdits('hi')), TRANPOSE_EDITS) print "Replace edits for 'hi'" checkOverlap(set(editModel.replaceEdits('hi')), REPLACE_EDITS)
def main(): """Trains all of the language models and tests them on the dev data. Change devPath if you wish to do things like test on the training data.""" # load training data trainPath = 'data/tagged-train.dat' trainingCorpus = Corpus(trainPath) # load dev data devPath = 'data/tagged-dev.dat' devCorpus = Corpus(devPath) print 'Unigram Language Model: ' unigramLM = UnigramModel(trainingCorpus) unigramSpell = SpellCorrect(unigramLM, trainingCorpus) unigramOutcome = unigramSpell.evaluate(devCorpus) print str(unigramOutcome) print 'Uniform Language Model: ' uniformLM = UniformModel(trainingCorpus) uniformSpell = SpellCorrect(uniformLM, trainingCorpus) uniformOutcome = uniformSpell.evaluate(devCorpus) print str(uniformOutcome) print 'Smooth Unigram Language Model: ' smoothUnigramLM = SmoothUnigramModel(trainingCorpus) smoothUnigramSpell = SpellCorrect(smoothUnigramLM, trainingCorpus) smoothUnigramOutcome = smoothUnigramSpell.evaluate(devCorpus) print str(smoothUnigramOutcome) print 'Smooth Bigram Language Model: ' smoothBigramLM = SmoothBigramModel(trainingCorpus) smoothBigramSpell = SpellCorrect(smoothBigramLM, trainingCorpus) smoothBigramOutcome = smoothBigramSpell.evaluate(devCorpus) print str(smoothBigramOutcome) print 'Backoff Language Model: ' backoffLM = BackoffModel(trainingCorpus) backoffSpell = SpellCorrect(backoffLM, trainingCorpus) backoffOutcome = backoffSpell.evaluate(devCorpus) print str(backoffOutcome) print 'Custom Language Model: ' customLM = CustomModel(trainingCorpus) customSpell = SpellCorrect(customLM, trainingCorpus) customOutcome = customSpell.evaluate(devCorpus) print str(customOutcome)
def verificarPlagioTimeProfile(diretorioCorpus, diretorioDocumento, limiar):
    '''
    Input: path to a folder containing files, path to a single file, and a
    containment threshold for the check.
    Times the instantiation of a corpus object and a document object, and the
    plagiarism check of the document against the corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
def __init__(self, clustered_corpus):
    self.corpora = []
    for cluster in clustered_corpus:
        corpus = Corpus(cluster)
        self.corpora.append(corpus)
    if len(self.corpora) < 2:
        raise ValueError("clustered_corpus argument is not clustered")
    self.candidate_to_cu_mapping = self.calculate_cus_for()
def create_corpus():
    corpus = Corpus()
    for folder in glob.iglob('texts/*'):
        for filename in glob.iglob(folder + "/*"):
            corpus.add_document(Document(filename))
            # corpus.add_document(Document(folder))
    corpus.build_vocabulary()
    return corpus
def verificarPlagioMemUsageProfile(diretorioCorpus, diretorioDocumento, limiar):
    '''
    Input: path to a folder containing files, path to a single file, and a
    containment threshold for the check.
    Measures the memory used by the instantiation of a corpus object and a
    document object, and by the plagiarism check of the document against the
    corpus.
    '''
    c = Corpus(diretorioCorpus)
    doc = Documento(diretorioDocumento)
    return c.verificarPlagio(doc, limiar)
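# Hedged note on the two *Profile functions above: neither contains any timing
# or memory instrumentation of its own, so they are presumably meant to be run
# under an external profiler. One plausible workflow (an assumption, not the
# project's documented one) uses line_profiler / memory_profiler:
#
#   kernprof -l -v script.py               # per-line timing (requires @profile)
#   python -m memory_profiler script.py    # per-line memory (requires @profile)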
def __init__(self, verbose=False):
    print('Loading corpus ...')
    self.corpus = Corpus(verbose=verbose)
    self.corpus.create_data()
    self.X_seqs, self.y_seqs = self.corpus.X_seqs, self.corpus.y_seqs
    self.seq_count = len(self.X_seqs)
    for seq_idx in range(self.seq_count):
        assert len(self.X_seqs[seq_idx]) == len(self.y_seqs[seq_idx])
    self.feature_dim = len(self.X_seqs[0][0])
def main():
    if len(sys.argv) != 3:
        print "Please provide paths to train and test corpora!"
    else:
        training_corpus = Corpus(sys.argv[1])
        test_corpus = Corpus(sys.argv[2])
        len_pos_train = len(training_corpus.generate_pos_pairs())
        len_neg_train = len(training_corpus.generate_neg_pairs())
        training_corpus.create_mallet_file("training_file_mallet.txt")
        len_test = len(test_corpus.generate_pos_pairs()) + len(test_corpus.generate_neg_pairs())
        test_corpus.create_test_file("test_file_mallet.txt")
        print "There are " + str(len_pos_train) + " positive training instances and " + \
            str(len_neg_train) + " negative training instances."
        print "There are " + str(len_test) + " test instances."
def test():
    c1 = Corpus("Big round boulder. That is a round snake.")
    c2 = Corpus("The dog is fat. The dog eats food. My dog is yellow. Your cat is yellow.")
    c3 = Corpus("Look out! Look behind you. Are you there? Are you okay? To you, I defer.")
    m1 = MarkovDict(c1)
    m2 = MarkovDict(c2, 2)
    m3 = MarkovDict(c3)
    print("m1:", m1.response())
    print("m1:", m1.response())
    print("m1:", m1.response())
    print("m2:", m2.response())
    print("m2:", m2.response())
    print("m2:", m2.response())
    print("m3:", m3.response())
    print("m3:", m3.response())
    print("m3:", m3.response())
def __init__(self, text, keywords=None, remove_stopword=True, with_segs=False):
    self.text = text
    self.corpus = Corpus(text,
                         keywords=keywords,
                         remove_stopword=remove_stopword,
                         with_segs=with_segs)
    self.network = nx.Graph()
    self.build_network()
def main():
    def get_data():
        client = pymongo.MongoClient()
        db = client.twitter4
        cursor = db.stream.aggregate([
            {'$match': {'date': {'$gt': datetime.datetime(2015, 11, 13)}}},
            {'$sort': {'date': 1}},
            {'$project': {'text': 1, 'date': 1}},
        ])
        return cursor

    def get_remote_data():
        client = pymongo.MongoClient(host='59.77.134.176')
        db = client.twitter3
        cursor = db.stream.aggregate([
            # {'$sort': {'date': 1}},
            {'$project': {'text': 1}},
        ])
        return cursor

    cursor = get_data()
    print 'calculate_entropy: each word is only counted once'
    olda = None
    reallen = 0
    # for chunk_no, doc_chunk in enumerate(cursor_serial(cursor, 3000)):
    for chunk_no, doc_chunk in enumerate(chunkize_serial(cursor, 3000, as_numpy=False)):
        print doc_chunk[0]['date']
        doc_chunk = [tweet['text'] for tweet in doc_chunk]
        reallen += len(doc_chunk)
        print chunk_no, reallen - len(doc_chunk), reallen, len(doc_chunk), 'lda'
        start = datetime.datetime.now()
        if not olda:
            corpus = Corpus(doc_chunk)
            olda = OnlineLDA(corpus, K=10)
        else:
            olda.fit(doc_chunk)  # Give the new chunk to online LDA
        print datetime.datetime.now() - start

        with codecs.open(r'G:\test18.out', "w", "utf-8-sig") as f:
            for topic_id, (topic_likelihood, topic_words, topic_tweets) in olda.get_lda_info():
                print '{}%\t{}'.format(round(topic_likelihood * 100, 2), topic_words)
                print '\t', topic_tweets
                f.write(topic_tweets + '\n')
        print '\n\n\n\n\n\n'
def __init__(self, text, keywords=None, remove_stopword=True, with_segs=False, weight_type='count'):
    self.text = text
    self.corpus = Corpus(text,
                         keywords=keywords,
                         remove_stopword=remove_stopword,
                         with_segs=with_segs)
    self.network = nx.Graph()
    self._network(weight_type)
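# Hedged usage sketch for the two constructors above, which build a word
# network from a text. The class name TextNetwork is hypothetical (only the
# __init__ bodies appear here); the degree inspection uses the standard
# networkx API.
# tn = TextNetwork("some input text", weight_type='count')
# top_nodes = sorted(dict(tn.network.degree()).items(),
#                    key=lambda kv: kv[1], reverse=True)[:10]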
def setup_corpus(self, theme, nb_docs):
    # holds the filtered words
    self.WORDS = theme + ";"
    # the 3 centrality metrics (one dictionary keyed by word for each)
    self.DEGCEN = {}
    self.CLOCEN = {}
    self.BETCEN = {}
    # the corpus theme
    self.THEME = theme
    # number of documents in the corpus
    self.NB_DOCS = nb_docs
    # the corpus itself
    self.corpus = Corpus(theme)
    self.corpus.download_collection(nb_docs, keyword=theme)
    self.A = self.corpus.get_adjacency_matrix()
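# Hedged sketch of how the three centrality dictionaries initialized above
# could later be filled from the adjacency matrix A, assuming A is a numpy
# array and using networkx (an assumption; setup_corpus itself leaves them
# empty):
# import networkx as nx
# G = nx.from_numpy_array(self.A)
# self.DEGCEN = nx.degree_centrality(G)
# self.CLOCEN = nx.closeness_centrality(G)
# self.BETCEN = nx.betweenness_centrality(G)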
def train(self, numIterations=100, testCorpusPath=None):
    if testCorpusPath:
        testCorpus = Corpus(testCorpusPath)
    for i in range(1, numIterations + 1):
        self.algorithm.train()  # call train method from algorithm
        if i % 10 == 0:
            # trainEval = Evaluation(self.algorithm.corpus)
            # print "Training evaluation for", i, "iteration(s):\n", trainEval.format()
            # self.algorithm.corpus.resetSentStats()
            if testCorpusPath:
                self.setPredictedTags(testCorpus)
                testEval = Evaluation(testCorpus)
                print "Testing evaluation for", i, "iteration(s):\n", testEval.format()
                # !!! We could use the prototype pattern here (so we don't need
                # to loop through sents): testCorpus = testCorpus.getPrototype(),
                # with Corpus.__init__ setting self.prototype = self.
                testCorpus.resetSentStats()
def test(): print("Creating new markov dict...") print(getDepth()) corpus_path = "./corpora/test/depth.txt" corpus = Corpus(importStrFromFile(corpus_path)) print(corpus) reverse_corpus = list(reversed(corpus)) print(reverse_corpus) forward_markov_dict = MarkovDict(source=corpus, depth=getDepth()) reverse_markov_dict = MarkovDict(source=reverse_corpus, depth=getDepth()) pprint(forward_markov_dict.dict) pprint(reverse_markov_dict.dict) bot = MarkovBot(forward_markov_dict, reverse_markov_dict) # pprint(bot.forward_dict.dict) # pprint(bot.reverse_dict.dict) print(bot.response(topic="markov"))
def main(): print("Creating new markov dict...") forward_markov_dict = MarkovDict(source=None, depth=getDepth()) reverse_markov_dict = MarkovDict(source=None, depth=getDepth()) print("Starting for loop to add corpora...") for corpus_path in corporaPaths(): corpus = Corpus(importStrFromFile(corpus_path)) print("Adding corpus with path '" + corpus_path + "'...") forward_markov_dict.add(corpus) reverse_markov_dict.add(list(reversed(corpus))) print("Initializing MarkovBot...") bot = MarkovBot(forward_markov_dict, reverse_markov_dict) print("\nWelcome to MarkovBot! Type a message. Type 'exit()' to quit.") message = prompt() while message != "exit()": print(bot.response(topic=message.split()[0])) message = prompt()
def main():
    if len(sys.argv) != 6:
        print "usage: python main.py <init_alpha> <modeldir_name> <num_topic> <data_file> <random/load>"
        sys.exit(1)
    init_alpha = float(sys.argv[1])
    directory = sys.argv[2]
    num_topics = int(sys.argv[3])
    data_file = sys.argv[4]
    start_type = sys.argv[5]
    # read the data
    corpus = Corpus()
    corpus.read_data(data_file)
    # run LDA
    LdaEstimator.run_EM(init_alpha, directory, num_topics, corpus, start_type)
def main():
    args = config()
    ns = args.ns                    # number of negative samples
    part = args.part                # part or full
    ng_small = args.ng_small        # smallest n-gram size
    ng_big = args.ng_big            # biggest n-gram size
    use_subsample = args.subsample  # whether to use subsampling
    corpus = Corpus(part, ng_small, ng_big, use_subsample)
    emb, _ = word2vec_trainer(corpus, ns=ns, dimension=64, learning_rate=0.05, iteration=50000)

    # Print similar words
    testwords = [
        "narrow-mindedness", "department", "campfires", "knowing", "urbanize",
        "imperfection", "principality", "abnormal", "secondary", "ungraceful"
    ]
    sim(testwords, corpus, emb)
def __init__(self, data_path, corpus_file):
    """
    WorbEmb class init.

    Parameters
    ----------
    data_path : str
        data full path
    corpus_file : str
        protein domain corpus file name

    Returns
    -------
    None
    """
    self.data_path = data_path
    self.corpus_file = corpus_file
    self.Corpus = Corpus(self.data_path, self.corpus_file)
    self.w2v_model = "none"
    self.w2v_file_out = ""
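# Hedged sketch of how the w2v_model slot initialized above might later be
# populated, assuming gensim 4.x's Word2Vec and that self.Corpus iterates over
# tokenized sentences; neither assumption is confirmed by this constructor.
# from gensim.models import Word2Vec
# self.w2v_model = Word2Vec(sentences=self.Corpus, vector_size=100,
#                           window=5, min_count=1, workers=4)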
def downloadCorpus(snapshotDir, corpusDir, projectName, configInfo):
    # 2. Dump the snapshots for a project
    msg = '---------------------------------------------------- \n'
    msg += ' Dump the corpus for project %s \n' % projectName
    msg += '---------------------------------------------------- \n'
    print(msg)
    project_snapshot_dir = os.path.join(snapshotDir, projectName)
    project_corpus_dir = os.path.join(corpusDir, projectName)
    if os.path.isdir(project_corpus_dir):
        print "!! %s already exists...returning \n" % project_corpus_dir
        # return
    corpus = Corpus(project_snapshot_dir, 'java', project_corpus_dir, configInfo)
    # logging.debug(corpus)
    # print corpus
    corpus.dump()
def create_gannt_corpus_obj():
    corpus = Corpus()
    doc_id = 0
    source_file_directory = 'GANNT/high/'
    target_file_directory = 'GANNT/low/'

    # Get a list of the source document file names, sorted by ID number
    source_file_names = [
        f for f in listdir(source_file_directory)
        if isfile(join(source_file_directory, f))
    ]
    source_file_names.sort()

    # Iterate through the source document files, storing each file's contents
    # and name in the document dictionary; the dictionary's key is an id
    # number that increments with each document.
    for fileName in source_file_names:
        doc_name = fileName[:-len('.txt')]  # strip the '.txt' extension
        temp_document = Document(
            doc_id, doc_name,
            open(source_file_directory + fileName, 'r').read().rstrip("\n"))
        corpus.add_source_document(temp_document)
        doc_id += 1

    # Get a list of the target document file names, sorted by ID number
    target_file_names = [
        f for f in listdir(target_file_directory)
        if isfile(join(target_file_directory, f))
    ]
    target_file_names.sort()

    # Read and add the target documents
    for fileName in target_file_names:
        doc_name = fileName[:-len('.txt')]
        temp_document = Document(
            doc_id, doc_name,
            open(target_file_directory + fileName, 'r').read().rstrip("\n"))
        corpus.add_target_document(temp_document)
        doc_id += 1
    return corpus
def main():
    clustered_corpus_path = 'clustered_corpus'
    clustered_corpus = read_clustered_corpus(clustered_corpus_path)
    corpus = merge_clustered_corpus_into_a_single_corpus(clustered_corpus)

    target_file_path = 'target.txt'
    text = read_text_file(target_file_path)
    document = Document(text)

    corpus = Corpus(corpus)
    clustered_corpus = ClusteredCorpus(clustered_corpus)

    candidate_to_rank_mapping = {}
    candidate_to_params_mapping = {}
    candidate_to_dfs_in_each_cluster_mapping = {}
    for candidate in document.get_candidates():
        tf = math.log(1.0 + document.get_tf_for(candidate), 10.0)
        # tf = document.get_tf_for(candidate)
        idf = math.log(1.0 + 1.0 / corpus.get_df_for(candidate), 2.0)
        cu = clustered_corpus.get_cu_for(candidate)
        rank = cu
        # rank = tf * cu
        # rank = tf * idf
        dfs_in_each_cluster = clustered_corpus.get_dfs_in_each_cluster_for(candidate)
        candidate_representative = corpus.get_representative_for(candidate)
        candidate_to_rank_mapping[candidate_representative] = rank
        candidate_to_params_mapping[candidate_representative] = (tf, idf, cu)
        candidate_to_dfs_in_each_cluster_mapping[candidate_representative] = dfs_in_each_cluster

    table = generate_table_based_on(
        candidate_to_rank_mapping,
        candidate_to_params_mapping,
        candidate_to_dfs_in_each_cluster_mapping
    )
    save_as_file(table)
    print('Done.')
def main(args):
    if args[1] == 'repair':
        if args[2] == 'all':
            datasets = ['be5', 'dagger', 'milo', 'okhttp', 'picasso']
        else:
            datasets = args[2:]
        for dataset in tqdm(datasets):
            repair_real(dataset)
    if args[1] == 'gen_training_data':
        project_path = args[2]
        checkstyle_file_path = args[3]
        project_name = args[4]
        corpus_dir = create_corpus(project_path, project_name, checkstyle_file_path)
        corpus = Corpus(corpus_dir, project_name)
        share = {
            key: core_config['DATASHARE'].getint(key)
            for key in ['learning', 'validation', 'testing']
        }
        synthetic.gen_dataset(corpus, share, target_dir=f'./styler/{project_name}-errors')
        ml.gen_IO(f'./styler/{project_name}-errors',
                  f'./styler/{project_name}-tokens',
                  only_formatting=True)
def create_icebreaker_corpus_obj():
    source_file = 'IceBreaker/Requirements.csv'
    target_file = 'IceBreaker/ClassDiagram.csv'
    corpus = Corpus()
    index = 0
    with open(source_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for name, desc in csv_reader:
            doc = Document(index, name, desc)
            corpus.add_source_document(doc)
            index += 1  # give each document a unique id, as in create_gannt_corpus_obj
    with open(target_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for name, desc in csv_reader:
            doc = Document(index, name, desc)
            corpus.add_target_document(doc)
            index += 1
    return corpus
def get_description(self):
    '''
    This function gets all the URLs, finds their description text, and
    updates them in the database.
    '''
    # get doc_id and url pairs
    self.mycursor.execute("select id,url from doc")
    myresult = self.mycursor.fetchall()
    for doc_id, url in myresult:
        # print("**********Doc ID is " + str(doc_id) + " ********")
        c = Corpus()
        name = c.url_to_dir(url)
        # print("Name is " + name)
        with open(name, "rb") as file:
            content = file.read()
        soup = BeautifulSoup(content, "lxml")
        metas = soup.find_all("meta")
        result = ''
        for meta in metas:
            if ('content' in meta.attrs) and ('name' in meta.attrs) and \
                    ((meta.attrs['name'] == 'description') or (meta.attrs['name'] == 'keywords')):
                result = " ".join(meta.attrs['content'].split())
        # if the html doesn't have a description/keywords meta tag, fall back
        # to the first prominent text element
        if result == '':
            script = soup.find(["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"])
            if script:
                temp = " ".join(script.text.split())
                result += temp if len(temp) < 200 else ""
        print(result)
        i_sql = "update doc set description =%s where id = %s"
        i_val = (result, doc_id)
        self.mycursor.execute(i_sql, i_val)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in DOC , DOC ID IS " + str(doc_id))