def test(self, directory):
    c = Corpus(directory)
    with open(path.join(directory, "!prediction.txt"), 'w', encoding="utf-8") as output:
        for (filename, wordlist) in c.wordlists():
            prob_spam = self.classifier.prob_list(wordlist)
            is_spam = prob_spam >= self.analyser.p_treshold
            tag = "SPAM" if is_spam else "OK"
            output.write(filename + " " + tag + "\n")
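# A minimal, self-contained sketch of the "!prediction.txt" format produced above,
# assuming Corpus.wordlists() yields (filename, word_list) pairs; write_predictions
# is an illustrative name, not part of the project's API.
from os import path

def write_predictions(directory, predictions):
    """predictions: iterable of (filename, is_spam) pairs."""
    with open(path.join(directory, "!prediction.txt"), 'w', encoding="utf-8") as output:
        for filename, is_spam in predictions:
            output.write(f"{filename} {'SPAM' if is_spam else 'OK'}\n")

# Example: write_predictions("test_mails", [("mail1", True), ("mail2", False)])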
def label_chunk_file(file_to_label: Path, output_file: Path):
    r""" Perform NER on file \p file_to_label and write to \p output_file """
    label_corpus = Corpus(file_to_label)
    label_corpus.fit_features()
    data_path = file_to_label.with_suffix(".dat_chunk")
    label_corpus.export(data_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    label_with_maxent(data_path=data_path, model_path=MODEL_PATH, output_file=output_file)
def test(self, mails_path):
    prediction_path = os.path.join(mails_path, "!prediction.txt")
    try:
        os.remove(prediction_path)
    except OSError:
        pass  # no previous prediction file to remove
    corpus = Corpus(mails_path)
    with open(prediction_path, 'a', encoding='utf-8') as f:
        for mail in corpus.emails():
            res = self.evaluate_mail(mail[1])
            f.write(f"{mail[0]} {self.pos_tag if res else self.neg_tag}\n")
def resume_segmentation(iterations=10):
    logger.debug('Resume segmentation')
    corpus = Corpus(Q=opt.gmm, subaction=opt.subaction)
    for iteration in range(iterations):
        logger.debug('Iteration %d' % iteration)
        corpus.iter = iteration
        corpus.resume_segmentation()
        corpus.accuracy_corpus()
    corpus.accuracy_corpus()
def build_model(train_path: Path):
    r""" Construct the learner model """
    train_corpus = Corpus(train_path)
    train_corpus.fit_features()
    data_path = train_path.with_suffix(".dat_name")
    train_corpus.export(data_path)
    if MODEL_PATH.exists():
        MODEL_PATH.unlink()  # Delete the existing model
    MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
    train_maxent_model(data_path=data_path, model_path=MODEL_PATH)
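# A hedged end-to-end sketch tying build_model and label_chunk_file together:
# train the MaxEnt model, then label a new file with it. The paths below are
# illustrative, and this assumes the two helpers belong to the same pipeline
# (the differing suffixes .dat_name vs .dat_chunk suggest they may come from
# separate name/chunk task variants sharing MODEL_PATH).
from pathlib import Path

if __name__ == "__main__":
    build_model(Path("data/train.txt"))
    label_chunk_file(Path("data/unlabeled.txt"), Path("out/labeled.txt"))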
def __init__(self, folder):
    self.folder = folder
    self.spams = []
    self.hams = []
    corp = Corpus(folder)
    for fname, content in corp.emails():
        if self.is_ham(fname):
            self.hams.append(Email(fname, content))
        else:
            self.spams.append(Email(fname, content))
def load_quora():
    QUORA_PATH = '/home/mgimenez/Dev/corpora/Quora/quora_duplicate_questions.tsv'
    dataset = Corpus('quora', QUORA_PATH)
    (train_non_sim, train_sim, dev_non_sim, dev_sim,
     test_non_sim, test_sim,
     vocab_processor, seq_len) = dataset.make_partitions_quora()
    return (train_non_sim, train_sim, dev_non_sim, dev_sim,
            test_non_sim, test_sim, vocab_processor, seq_len)
def test_tok1_interro():
    """ Check that indexes can be shown """
    corpus = Corpus(tok_path)
    res = corpus.interrogate(show=['s', 'i', 'l'])
    sortd = res.results[sorted(res.results.columns)]
    three = ['0/0/corpus', '0/0/this', '0/1/linguistics']
    assert_equals(list(sortd.columns)[:3], three)
    assert_equals(sortd.sum().sum(), 77)
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.most_links = (None, 0)  # keeps track of the page with the most valid out-links
    self.traps = set()
    # the next 3 attributes are used to compare consecutive links for trap detection
    self.old_link = None
    self.old_path = None
    self.old_query = None
def main():
    QAfile = sys.argv[1]
    ReviewFile = sys.argv[2]
    minReview = int(sys.argv[3])
    k = int(sys.argv[4])
    numiter = int(sys.argv[5])
    Lambda = float(sys.argv[6])
    predictionsOut = sys.argv[7]
    rankingOut = sys.argv[8]

    corpus = Corpus(QAfile, ReviewFile, minReview)
    corpus.construct_QAnswersAndQPerItem()
    corpus.construct_SentencesAndSPerItem()
    corpus.Calculate_PairWiseFeature()
    print("Vocabulary Size: " + str(corpus.Map.V))
    print("Number of Questions: " + str(len(corpus.QAnswers)))
    print("Number of Reviews: " + str(len(corpus.Sentences)))
    print("Number of Items: " + str(len(corpus.Map.ItemIDMap)))
    print("Avg review length: " + str(sum(corpus.Avgdl.values()) / len(corpus.Avgdl)))

    model = Model(k, numiter, Lambda, corpus)
    sess = model.train_model()
    print("\nModel is trained and optimal model loaded!\n")

    valid_accuracy, test_accuracy, topRanked = model.valid_test_perf(sess)
    if predictionsOut:
        model.save_predictions(topRanked, predictionsOut)
    if rankingOut:
        topRanked = model.top_ranked(sess, 10)
        model.save_top_ranked(topRanked, rankingOut)
    print("Predictions are saved\n")

    valid_AUC, test_AUC = model.AUC(sess)
    print("-----------------------------------------------")
    print("----------------------------------------------\n")
    print("Accuracy: ")
    print("\tValidation: " + str(valid_accuracy))
    print("\tTest: " + str(test_accuracy))
    print("\n")
    print("AUC: ")
    print("\tValidation: " + str(valid_AUC))
    print("\tTest: " + str(test_AUC))
    print("\n")
    print("-----------------------------------------------")
    print("----------------------------------------------\n")
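# A hedged argparse equivalent of the positional sys.argv parsing in main() above;
# the argument names and help strings are illustrative, not from the source.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Train and evaluate the QA/review ranking model.")
    parser.add_argument("qa_file", help="path to the QA file")
    parser.add_argument("review_file", help="path to the review file")
    parser.add_argument("min_review", type=int, help="minimum reviews per item")
    parser.add_argument("k", type=int, help="number of latent factors")
    parser.add_argument("num_iter", type=int, help="number of training iterations")
    parser.add_argument("lam", type=float, help="regularization weight (Lambda)")
    parser.add_argument("predictions_out", help="where to save predictions")
    parser.add_argument("ranking_out", help="where to save rankings")
    return parser.parse_args()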
def parse(self, corenlppath=False, operations=False, copula_head=True,
          speaker_segmentation=False, memory_mb=False, *args, **kwargs):
    """
    Parse an unparsed corpus, saving to disk

    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str

    :param operations: which kinds of annotations to do
    :type operations: str

    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :param memory_mb: amount of memory in MB for the parser
    :type memory_mb: int

    :param copula_head: make copula head in dependency parse
    :type copula_head: bool

    :Example:

    >>> parsed = corpus.parse(speaker_segmentation=True)
    >>> parsed
    <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    from make import make_corpus
    from corpus import Corpus
    #from process import determine_datatype
    #dtype, singlefile = determine_datatype(self.path)
    if self.datatype != 'plaintext':
        raise ValueError('parse method can only be used on plaintext corpora.')
    kwargs.pop('parse', None)
    kwargs.pop('tokenise', None)
    return Corpus(
        make_corpus(self.path,
                    parse=True,
                    tokenise=False,
                    corenlppath=corenlppath,
                    operations=operations,
                    copula_head=copula_head,
                    speaker_segmentation=speaker_segmentation,
                    memory_mb=memory_mb,
                    *args, **kwargs))
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    # variables needed for analytics calculation
    self.traps = set()
    self.downloads = set()
    self.valid_links_count = 0
    self.url_with_most_valid_links = ''
    self.current_highest_valid_link_count = -1
    self.subdomains_and_frequencies = defaultdict(int)
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.mostValidOut = 0
    self.mostValidOutURL = ""
    self.listTraps = []
    self.subdomainCount = {}
    self.prev_query = []
    self.param = ""
    self.count = 0
    self.urldeque = deque(maxlen=10)
def load_ibm():
    """ Load the train, dev, and test datasets """
    IBM_PATH = '/home/mgimenez/Dev/corpora/Quora/IBM'
    TRAIN_PATH = join(IBM_PATH, 'train.tsv')
    train = Corpus('ibm', TRAIN_PATH)
    DEV_PATH = join(IBM_PATH, 'dev.tsv')
    dev = Corpus('ibm', DEV_PATH)
    TEST_PATH = join(IBM_PATH, 'test.tsv')
    test = Corpus('ibm', TEST_PATH)
    vocab_processor, seq_len = build_vocabulary(train.sim_data, train.non_sim_data)
    train.to_index(vocab_processor)
    dev.to_index(vocab_processor)
    test.to_index(vocab_processor)
    # NB: the test data is returned in (sim, non_sim) order, unlike train and dev
    return (train.non_sim_data, train.sim_data,
            dev.non_sim_data, dev.sim_data,
            test.sim_data, test.non_sim_data,
            vocab_processor, seq_len)
def generateData():
    rep = Representor(None, 'citybeat', 'next_week_candidate_event_25by25_merged')
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'next_week_candidate_event_25by25_merged')
    true_event_list, false_event_list = loadNextWeekData()
    EventFeatureTwitter(None).GenerateArffFileHeader()
    for event in true_event_list + false_event_list:
        EventFeatureTwitter(event, corpus, rep).printFeatures()
def __init__(self, path):
    parsemeCorpus = Corpus(path)
    self.train_sents = None
    self.dev_sents = None
    self.test_sents = None
    if parsemeCorpus.sentences:
        self.train_sents = parsemeCorpus.sentences
    if parsemeCorpus.devSents:
        self.dev_sents = parsemeCorpus.devSents
    if parsemeCorpus.testSents:
        self.test_sents = parsemeCorpus.testSents
def next(self):
    import re  # moved out of the inner loop below
    urls = []
    searchURL = GoogleNewsURLCrawler._getNextSearchURL(
        self.queryExpression, self.pageCount, self.pageSizeToRetreive)
    self.pageCount += self.pageSizeToRetreive
    GoogleNewsURLCrawler.chromeDriver.get(searchURL)
    searchResults = GoogleNewsURLCrawler.chromeDriver.find_elements_by_class_name('_hJs')
    for searchResult in searchResults:
        try:
            link = searchResult.find_element_by_tag_name('h3') \
                               .find_element_by_tag_name('a').get_attribute('href')
            date = searchResult.find_element_by_class_name('slp') \
                               .find_element_by_class_name('_QHs').get_attribute('innerText')
            if date.find('ago') != -1:
                date = GoogleNewsURLCrawler._GetToday()
            urls.append((link, date))
        except Exception as errorMessage:
            print(errorMessage)
            continue
    corpora = []
    for url in urls:
        try:
            GoogleNewsURLCrawler.chromeDriver.get(url[0])
            terms = ''
            try:
                body = GoogleNewsURLCrawler.chromeDriver.find_element_by_tag_name('body')
                for element in body.find_elements_by_xpath(".//*"):
                    if (element.tag_name != 'style') and (element.tag_name != 'script'):
                        innerText = ''
                        try:
                            innerText = element.get_attribute('innerText')
                            if innerText is not None:
                                tokens = [t for t in re.split(r'\s+|\t+|\n+|,|:|;|\.', innerText) if t]
                                for token in tokens:
                                    terms += (token + ' ')
                        except Exception as ex:
                            print('exception: ' + str(ex) + ' exception text: ' + innerText)
                            print('\n')
                            print('\n')
                            continue
            except Exception:
                continue
            corpora.append(Corpus(self.keyword, url[1], url[0], terms))
        except Exception:
            continue
    return corpora
def __init__(self):
    """ Initialize an instance of LDA. """
    # corpus of the LDA model
    self.corpus = Corpus()
    # the distribution of topics over terms
    self.topic_distribution_over_term = None
    # the distribution of documents over topics
    self.document_distribution_over_topic = None
    # Gibbs sampler
    self.gibbs_sampler = None
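# A minimal sketch of one collapsed-Gibbs-sampling pass for LDA, illustrating what
# self.gibbs_sampler might do; this is the textbook update, not the project's
# implementation. `docs` is a list of lists of term ids, `z` the current topic
# assignments, and the n_* arrays the usual count matrices.
import numpy as np

def gibbs_pass(docs, z, n_dk, n_kw, n_k, alpha, beta, rng):
    V = n_kw.shape[1]  # vocabulary size
    K = n_kw.shape[0]  # number of topics
    for d, doc in enumerate(docs):
        for i, w in enumerate(doc):
            k = z[d][i]
            # remove the current assignment from the counts
            n_dk[d, k] -= 1; n_kw[k, w] -= 1; n_k[k] -= 1
            # full conditional: p(k | rest) proportional to (n_dk + alpha)(n_kw + beta)/(n_k + V*beta)
            p = (n_dk[d] + alpha) * (n_kw[:, w] + beta) / (n_k + V * beta)
            k = rng.choice(K, p=p / p.sum())
            # record the new assignment and restore the counts
            z[d][i] = k
            n_dk[d, k] += 1; n_kw[k, w] += 1; n_k[k] += 1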
def main():
    args = get_args()
    corpus = Corpus("text8", args.gram_min, args.gram_max, args.part == "part")
    subword_embeddings, _ = train_fasttext(corpus,
                                           ns_num=args.ns,
                                           window_size=5,
                                           dimension=100,
                                           learning_rate=0.01,
                                           epoch=1,
                                           subsampling=True)
    find_similar_words(corpus, subword_embeddings, args.gram_min, args.gram_max)
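# A hedged sketch of fastText-style lookups, assuming subword_embeddings maps each
# character n-gram to a vector and a word's vector is the sum of its n-gram vectors;
# the real find_similar_words above may work differently.
import numpy as np

def char_ngrams(word, gram_min, gram_max):
    padded = '<' + word + '>'  # fastText pads words with boundary markers
    return [padded[i:i + n]
            for n in range(gram_min, gram_max + 1)
            for i in range(len(padded) - n + 1)]

def word_vector(word, subword_embeddings, gram_min, gram_max):
    vecs = [subword_embeddings[g]
            for g in char_ngrams(word, gram_min, gram_max)
            if g in subword_embeddings]
    return np.sum(vecs, axis=0) if vecs else None

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))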
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.subdomains = {"ics.uci.edu": 0}
    self.downloaded_urls = []
    self.out_links = {}
    self.traps = []
    self.checked = {}
    self.threshold = 1000
def train(self, dot_path):
    print("Begin train...")
    corpus = Corpus()
    corpus_data = corpus.load_corpus(dot_path)
    for data in corpus_data:
        for conversation in data:  # each `conversation` is one dialogue session
            statement_history = []
            for text in conversation:  # take sentences from the training dialogue in order
                if statement_history:
                    self.storage.add(statement_history[-1], text.encode('utf-8'))
                statement_history.append(text.encode('utf-8'))  # append the current statement to the history
    print("End of train!")
def main():
    # Command-line argument parsing
    default_dir = 'sample-data'
    directory = argv[1] if len(argv) == 2 else default_dir
    if not all([len(argv) <= 2, os.path.isdir(directory)]):
        print("""Usage : python {} <directory>

    <directory> : path to directory of annotation XMLs""".format(argv[0]))
        exit()
    corpus = Corpus(directory)

    # Set up accumulators
    annotator_ids = set()
    document_ids = set()
    extents = set()
    tag_types = {'NONE'}

    # First pass over the corpus to accumulate IDs/labels
    for document in corpus:
        document_id, annotator_id, phase = parse_name(document)
        document_ids.add(document_id)
        annotator_ids.add(annotator_id)
        tag_types.update(document.extent_types)
        for tag in document.consuming_tags():
            extents.add(get_extent(document_id, tag))

    # Index all the things
    annotators, labels, subjects = map(index, (annotator_ids, tag_types, extents))

    # Set up a numpy array to store the data
    shape = (len(subjects), len(labels))
    data = numpy.zeros(shape, dtype=int)
    extents = dict.fromkeys(extents)

    # Second pass over the corpus to populate the extents dictionary
    for document in corpus:
        document_id, annotator_id, phase = parse_name(document)
        for tag in document.consuming_tags():
            extent = get_extent(document_id, tag)
            if not extents[extent]:
                extents[extent] = ['NONE'] * len(annotators)
            annotator = annotators[annotator_id]
            extents[extent][annotator] = tag.tag

    # Final pass over the extents dictionary to populate the data matrix
    for subject, annotations in extents.items():
        for label in annotations:
            row, column = subjects[subject], labels[label]
            data[row, column] += 1

    print("Fleiss's kappa : {}".format(fleiss.kappa(data)))
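# A minimal sketch of Fleiss's kappa over the (subjects x labels) count matrix
# built above, in case the `fleiss` helper is unavailable; this is the standard
# formula and assumes every row sums to the same number of ratings n.
import numpy as np

def fleiss_kappa(counts):
    counts = np.asarray(counts, dtype=float)
    N = counts.shape[0]                  # number of subjects
    n = counts.sum(axis=1)[0]            # ratings per subject
    p_j = counts.sum(axis=0) / (N * n)   # marginal label proportions
    P_i = (np.square(counts).sum(axis=1) - n) / (n * (n - 1))  # per-subject agreement
    P_bar = P_i.mean()                   # mean observed agreement
    P_e = np.square(p_j).sum()           # chance agreement
    return (P_bar - P_e) / (1 - P_e)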
def train(self, train_corpus_dir):
    """
    Train on the given email dataset
    :param train_corpus_dir:
    """
    self.truth = utils.read_classification_from_file(train_corpus_dir)  # load truth
    train_corpus = Corpus(train_corpus_dir)
    self.get_SPAM_percentage(train_corpus)  # not in use now
    for val in email_keys:  # get values for parts of the email header
        self.classify_part(train_corpus, val.lower())
    self.classify_payload(train_corpus)  # get values for the email payload/body
def evaluate(self, corpus_path):
    test_corpus = Corpus(corpus_path)
    test_set = [{
        'x': list(self.__char_processor.transform(item['text']))[0],
        'y': list(self.__tag_processor.transform(item['tag']))[0],
        'length': item['length'],
    } for item in test_corpus.items()]
    result = self.__estimator.evaluate(
        input_fn=lambda: self.__input_fn(test_set),
    )
    print('Test: %d' % len(test_corpus))
    print(result)
def addChannel(self, paths):
    c = Corpus()
    for path in paths:
        print(path)
        with open(path) as f:
            text = f.read()
        name = path.split('/')[-1]
        d = Document(name, text)
        c.add_document(d)
    self.sourceboard.addChannel(c)
    self.refresh()
    self.frame.fSizer.Layout()
    self.frame.Fit()
def document_check():
    """ Check that the document lazy attribute works """
    corpus = Corpus(speak_path)
    df = corpus[0][0].document
    fir = ['This', 'this', 'DT', 'O', 3, 'det', '0', '1']
    assert_equals(list(df.ix[1, 1]), fir)
    kys = list(df._metadata[5].keys())
    lst = ['year', 'test', 'parse', 'speaker', 'num']
    assert_equals(kys, lst)
    assert_equals(corpus.files, None)
    assert_equals(corpus.datatype, 'conll')
def __init__(self, flavor, user_configs):
    ##########################################################################
    # define directories
    self.runs_dir = load_rnnlabrc('runs_dir')
    self.log_path = os.path.abspath(
        os.path.join(os.path.expanduser('~'), 'rnnlab_log.csv'))
    ##########################################################################
    # make configs dict
    self.configs_dict = self.make_configs_dict(user_configs, flavor)
    ##########################################################################
    # calc num_epochs
    self.num_reps = int(self.configs_dict['num_reps'])
    self.num_iterations = int(self.configs_dict['num_iterations'])
    self.num_epochs = int(self.num_reps / self.num_iterations)
    if not self.num_reps % self.num_iterations == 0:
        sys.exit('rnnlab: "num_reps" must be divisible by "num_iterations"')
    print('Num reps: {} / Num iterations: {} --> Num epochs : {}'.format(
        self.num_reps, self.num_iterations, self.num_epochs))
    ##########################################################################
    # make corpus
    corpus_kwargs = {
        key: self.configs_dict[key]
        for key in ['corpus_name', 'vocab_file_name', 'freq_cutoff',
                    'probes_name', 'mb_size', 'bptt_steps',
                    'num_mbs_in_doc', 'block_order']
    }
    corpus_kwargs['num_epochs'] = self.num_epochs
    self.corpus = Corpus(**corpus_kwargs)
    ##########################################################################
    # create rnn_graph
    num_input_units = len(self.corpus.token_list)
    self.rnn_graph = create_rnn_graph(num_input_units, self.configs_dict)
    ##########################################################################
    # assign instance variables
    self.model_name = self.configs_dict['model_name']
    self.block_order = str(self.configs_dict['block_order'])
    self.n_data = int(self.configs_dict['n_data'])
    self.mb_size = int(self.configs_dict['mb_size'])
    self.num_hidden_units = int(self.configs_dict['num_hidden_units'])
    self.bptt_steps = int(self.configs_dict['bptt_steps'])
    self.num_ba_samples = int(self.configs_dict['num_ba_samples'])
    self.probes_ba_list = []
    ##########################################################################
    # calc instance variables
    self.stop_mb = (self.corpus.num_train_doc_ids * self.num_reps
                    * self.corpus.num_mbs_in_doc)
    print('Stop minibatch: {:,}'.format(self.stop_mb))
    mb_n = int(self.stop_mb / self.n_data)
    self.data_mbs = np.arange(mb_n, (mb_n * self.n_data) + mb_n, mb_n)
    print('self.data_mbs :', self.data_mbs)
def applySingleRules(IDsFilename):
    """
    Originally meant to apply just one rule; now used to apply one feature at a
    time to the given corpus, showing how often each feature occurs in ironic
    and regular reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, file=IDsFilename))
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, dataSet.reviews)
    showFeatureOccurrence(features, featureVectors)
    gold = dataSet.goldStandard
    # decisiveFeatureNames = ["Scare quotes",
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing",
    #                         "Emoticon Winking", "Emotion Tongue",
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]
    decisiveFeatureNames = [f.name for f in features]
    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])
        targets = []
        cls = []
        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])
        print("\nClassifying by rule: ", d)
        showPerformance(targets, cls)
def test(self, test_corpus_dir):
    '''
    Creates a dict of classifications and writes it to file
    :param test_corpus_dir: path to test dir
    :return: None
    '''
    c = Corpus(test_corpus_dir)
    class_dict = {}
    # Iterate over emails with the generator in Corpus
    for email in c.emails():
        # Get word statistics of the email: word frequency and word count
        word_freq, word_count = self.get_word_count_for_mail(email[1])
        # Compute the spaminess of each known word
        spaminesses = []
        for word in word_freq:
            s = self.get_spaminnes_of_word(word)
            if s is not None:
                spaminesses.append(s)
        # Combine spaminesses: S = P / (P + prod(1 - p)), guarding against
        # division by zero
        product = self.prod(spaminesses)
        complement = self.one_without_spaminesses(spaminesses)
        lower = product + complement
        if lower != 0:
            overall_spaminess = product / lower
        else:
            overall_spaminess = 0
        # Final decision
        if overall_spaminess >= 0.5:
            class_dict[email[0]] = "SPAM"
        else:
            class_dict[email[0]] = "OK"
    # Create the !prediction.txt file
    utils.write_classification_to_file(
        test_corpus_dir + "/!prediction.txt", class_dict)
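# Hedged sketch of the two helpers used above but not shown in the source (they are
# methods on the class; free functions here for illustration): `prod` multiplies
# the word spaminesses and `one_without_spaminesses` multiplies their complements,
# giving the Graham-style combination S = P / (P + prod(1 - p)).
from functools import reduce
import operator

def prod(values):
    return reduce(operator.mul, values, 1.0)

def one_without_spaminesses(values):
    return reduce(operator.mul, (1.0 - v for v in values), 1.0)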
def test(self, email_adress):
    global all_words, spam_words, probability_spam, count_spams, count_emails
    fnames_with_body = Corpus(email_adress).emails()
    # Untrained case: fall back to a crude HTML heuristic
    if probability_spam == 0:
        html_words = ['<html>', '<p>', '</a>', '<br>', '<head>', '<meta>', '<title>', '<body>']
        with open(email_adress + '/!prediction.txt', 'w', encoding="utf-8") as f:
            for fname in fnames_with_body:
                for word in html_words:
                    if word in fname[1]:  # an HTML tag in the email's body -> it's SPAM!
                        f.write(fname[0] + ' SPAM\n')
                        break
                else:
                    f.write(fname[0] + ' OK\n')  # else it's probably ham
    # Trained case
    else:
        with open(email_adress + '/!prediction.txt', 'w', encoding="utf-8") as f:
            for fname in fnames_with_body:
                email_words = TrainingCorpus.get_words_from_email(fname[1])
                probability_spam_words = []
                for word in email_words:
                    # skip empty words and words we know nothing about
                    if (word not in all_words) or (word == ''):
                        continue
                    if word not in spam_words:
                        probability_spam_word = 0
                    else:
                        # Bayes' theorem: the probability that the email is spam,
                        # given that it contains this word
                        probability_spam_word = (
                            (spam_words[word] / count_spams * probability_spam)
                            / (all_words[word] / count_emails))
                    probability_spam_words.append(probability_spam_word)
                # Final probability that the email is spam (guard against emails
                # with no known words)
                if probability_spam_words:
                    probability_spam_email = sum(probability_spam_words) / len(probability_spam_words) * 100
                else:
                    probability_spam_email = 0
                if probability_spam_email > 70:
                    f.write(fname[0] + ' SPAM\n')
                else:
                    f.write(fname[0] + ' OK\n')