def __init__(self, datapath, tree):
    # decision tree obtained from the learner
    self.DT = tree
    if datapath is not None:
        test_file = datapath
    elif len(sys.argv) <= 1:
        test_file = "../data/origin/test.csv"
    else:
        test_file = sys.argv[1]
    f = open(test_file, "r")
    lines = f.readlines()
    f2 = open("btest_output.csv", "w")

    # get the list of attributes
    readInput = Datareader(lines)
    attrNames = readInput.attrNames
    records = readInput.records
    resultAttr = readInput.resultAttr
    typeList = readInput.typeList

    helper.process(records, attrNames[:len(attrNames) - 1], typeList)
    self.predict(records, tree, resultAttr)

    # write the predicted records out as CSV
    for record in records:
        for attr in attrNames:
            val = record[attr]
            if attr == resultAttr:
                val = int(val)
            f2.write(str(val) + ',')
        f2.write('\n')
def __init__(self, datapath, tree):
    # decision tree obtained from the learner
    self.DT = tree
    if datapath != "None":
        validate_file = datapath
    elif len(sys.argv) <= 1:
        validate_file = "bvalidate.csv"
    else:
        validate_file = sys.argv[1]
    f = open(validate_file, "r")
    lines = f.readlines()

    # get the list of attributes
    readInput = Datareader(lines)
    attrNames = readInput.attrNames
    records = readInput.records
    resultAttr = readInput.resultAttr
    typeList = readInput.typeList

    helper.process(records, attrNames, typeList)

    print
    print "---> Using the validation set, the prediction accuracy is:"
    print(self.validate(records, self.DT, resultAttr))
    print
def __init__(self, datapath):
    if datapath != "None":
        train_file = datapath
    elif len(sys.argv) <= 1:
        train_file = "btrain.csv"
        #train_file = "../data/test.csv"
    else:
        train_file = sys.argv[1]
    f = open(train_file, "r")
    lines = f.readlines()
    lines = lines[:16000]

    # get the list of attributes
    readInput = Datareader(lines)
    attrNames = readInput.attrNames
    records = readInput.records
    resultAttr = readInput.resultAttr
    typeList = readInput.typeList
    print typeList

    helper.process(records, attrNames, typeList)
    print "read " + str(len(attrNames)) + " attributes"
    print "read " + str(len(records)) + " records"
    print

    self.tree = self.DTL(records, attrNames, resultAttr)
    print "-----> Printing DNF of the decision tree ..."
    helper.printDNF(self.tree)
    print
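# A hedged sketch of how the three constructors above might fit together. The class
# names Learner, Validator, and Tester are hypothetical, since only the __init__
# bodies are shown here; the file paths follow the defaults used above.
if __name__ == '__main__':
    learner = Learner("btrain.csv")                   # builds learner.tree via DTL
    Validator("bvalidate.csv", learner.tree)          # prints validation accuracy
    Tester("../data/origin/test.csv", learner.tree)   # writes btest_output.csv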
def process(self):
    # process has been changed to fetch the news; the process function in helper.py
    # then converts it into a format that bikeso can accept
    for url in self.list:
        resultPage = fetch(url)
        result = loads(resultPage)[self.content]
        yield (result['title'], helper.process(result[self.desc]))
def main():
    directory = '/Users/jtim/Dropbox/Academic/research/dissertation/research/output/word-length'
    if not os.path.exists(directory):
        os.makedirs(directory)

    by_author = {}
    for author, files in corpora.items():
        by_author[author] = read_files_into_string(files)

    # Transform the authors' corpora into lists of word tokens
    by_author_tokens = {}
    by_author_length_distributions = {}
    for author in by_author:
        tokens = by_author[author].split()

        # Filter out punctuation
        by_author_tokens[author] = ([process(token) for token in tokens
                                     if any(c.isalpha() for c in token)])

        # Get a distribution of token lengths
        token_lengths = [len(token) for token in by_author_tokens[author]]
        plt.ion()
        by_author_length_distributions[author] = nltk.FreqDist(token_lengths)
        by_author_length_distributions[author].plot(9, title=author, color='grey')
        plt.savefig('{}/word-length-{}.png'.format(directory, author))
        plt.ioff()
        plt.close("all")
def main():
    pure_persian_works = []
    arabic_counter = Counter()
    arabic_vocabulary = set()

    for name in file_names:
        if 'ar.txt' in name:
            with open("{}{}".format(root_dir, name), 'r') as f:
                #### Error, use Counter instead of set? ####
                words = set(process(f.read()).split())
                arabic_counter.update(words)
                arabic_vocabulary.update(words)

    # remove infrequent words
    for word, count in arabic_counter.items():
        if len(word) < 3:
            # drop one- and two-letter words; they have a high probability of being a homograph
            arabic_vocabulary.remove(word)
        elif count <= 2:
            arabic_vocabulary.remove(word)

    with open("{}pure-persian.txt".format(directory), 'w') as out_file:
        for name in file_names:
            if 'fa.txt' in name:
                with open("{}{}".format(root_dir, name), 'r') as f:
                    #### Error, use Counter instead of set? ####
                    words = set(process(f.read()).split())
                    if len(words - arabic_vocabulary) > len(words) * .70:
                        # if x percent of words are not in the Arabic vocabulary
                        out_file.write(name + "\n")
                        out_file.write("Number of words: {}\n".format(len(words)))
                        out_file.write("Percent Persian words: {}\n".format(
                            len(words - arabic_vocabulary) / len(words)))
                        out_file.write("Words of possible Arabic origin: {}\n".format(
                            words.intersection(arabic_vocabulary)))
                        out_file.write("------------------------------\n\n")
                        pure_persian_works.append(name)

        out_file.write("Pure Persian Works:\n")
        for i in pure_persian_works:
            out_file.write("\t" + "-" + i + "\n")
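# The "use Counter instead of set?" comments above ask about counting every occurrence
# of a word rather than one occurrence per work. A minimal sketch of that alternative,
# reusing the same names; only the Arabic-counting branch is shown:
with open("{}{}".format(root_dir, name), 'r') as f:
    tokens = process(f.read()).split()
    arabic_counter.update(tokens)        # term frequency: count every occurrence
    arabic_vocabulary.update(tokens)     # vocabulary membership is unchanged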
def chi_squared(relative_corpus, authors=[]):
    by_author = {}
    for key, files in corpora.items():
        by_author[key] = read_files_into_string(files)

    # Transform the authors' corpora into lists of word tokens
    by_author_tokens = {}
    by_author_length_distributions = {}
    for author in by_author:
        tokens = by_author[author].split()

        # Filter out punctuation
        by_author_tokens[author] = ([
            process(token) for token in tokens
            if any(c.isalpha() for c in token)
        ])

    for author in authors:
        # First, build a joint corpus and identify the 500 most frequent words in it
        joint_corpus = (by_author_tokens[author] +
                        by_author_tokens[relative_corpus])
        joint_freq_dist = nltk.FreqDist(joint_corpus)
        most_common = list(joint_freq_dist.most_common(500))

        # What proportion of the joint corpus is made up of the candidate
        # author's tokens?
        author_share = (len(by_author_tokens[author]) / len(joint_corpus))

        # Now, let's look at the 500 most common words in the candidate
        # author's corpus and compare the number of times they can be observed
        # to what would be expected if the author's writings and the relative
        # corpus were both random samples from the same distribution.
        chisquared = 0
        for word, joint_count in most_common:
            # How often do we really see this common word?
            author_count = by_author_tokens[author].count(word)
            relative_count = by_author_tokens[relative_corpus].count(word)

            # How often should we see it?
            expected_author_count = joint_count * author_share
            expected_joint_count = joint_count * (1 - author_share)

            # Add the word's contribution to the chi-squared statistic
            chisquared += ((author_count - expected_author_count) *
                           (author_count - expected_author_count) /
                           expected_author_count)
            chisquared += ((relative_count - expected_joint_count) *
                           (relative_count - expected_joint_count) /
                           expected_joint_count)

        print("The Chi-squared statistic for", author, "compared to",
              relative_corpus, "is", chisquared)
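# A minimal usage sketch for chi_squared(), assuming the module-level corpora dict maps
# corpus labels to lists of file paths as in the other scripts here; the labels below
# are placeholders, not real corpus names.
if __name__ == '__main__':
    chi_squared("reference_corpus", authors=["candidate_author"])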
def main():
    persian_words = ['خواهد', 'بايد', 'نزد', 'نماييد', 'اين', 'نموده', 'شما', 'اوست', 'بگو', 'شود',
                     'خود', 'هست', 'گشت', 'شويد', 'راه', 'بآن', 'امروز', 'نمايد', 'چون', 'شوند',
                     'دوستان', 'شده', 'بوده', 'آنكه', 'بود', 'آفتاب', 'اند', 'داده', 'فردا', 'شايد',
                     'چه', 'نيست', 'را', 'آنچه', 'شود', 'آنچه', 'مانده', 'بيني', 'جان', 'باز',
                     'اگر', 'است', 'آمد', 'كنيد', 'سرا', 'نما', 'ميشود', 'نمود', 'دار', 'نبوده',
                     'شوي', 'ميفرمايد', 'دوست']
    corrupt = []
    pure_persian_works = []
    arabic_counter = Counter()
    arabic_vocabulary = set()
    arabic_works_count = 0

    for name in file_names:
        if 'ar.txt' in name:
            corrupt_word_count = 0
            arabic_works_count += 1
            with open(name, 'r') as f:
                words = set(process(f.read()).split())
                arabic_counter.update(words)
                arabic_vocabulary.update(words)
                for word in words:
                    if word in persian_words:
                        corrupt_word_count += 1
            if corrupt_word_count > 0:
                corrupt.append(name)

    print(corrupt)
    print("{} of {} are corrupted".format(len(corrupt), arabic_works_count))
    shared_access_key_value='ak9L18tmI2FssJBIZLz3OCs8U55rcYZaSbwgAR6/B34=')

for email in emails:
    try:
        # Split and parse emails
        # Get contents
        # Save to db
        doc = {}
        doc['from'] = fx.getContact(email.sender)
        doc['to'] = fx.getContacts(email.to_recipients)
        if email.cc_recipients:
            doc['cc'] = fx.getContacts(email.cc_recipients)
        doc['subject'] = email.subject.strip()

        msgs = fx.process(email.text_body)
        msgs = [[doc['subject']]] + msgs
        doc['emails'] = msgs
        doc['intent'] = fx.getIntent(msgs)
        #doc['intentions'] = fx.getIntentPerLine(msgs)
        doc['caseid'] = str(uuid.uuid4())
        doc['state'] = 'new'
        doc['handoff'] = False
        doc['botHasReplied'] = False
        #print docid

        # Send message to the incoming queue
        event = Message(json.dumps(doc))
        nttBus.send_queue_message('htn.incoming.emails', event)
        docid = sdmails.insert_one(doc).inserted_id
    except Exception as e:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 14:59:44 2020

@author: qinzhen
"""
import pickle
from helper import process

# Read the words
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

# t stores the translation probabilities; t1 stores, for each e, the set of all f it co-occurs with
t = dict()
t1 = dict()
for e in vocab_en:
    t[e] = dict()   # store the keys
    t1[e] = set()

# Iterate over the sentence pairs
for i, sentence in enumerate(en):
    for e in sentence:
        t1[e] = t1[e].union(set(de[i]))

# Uniform initialization: t[e][f] = 1 / |t1[e]|
for e in t:
    ne = len(t1[e])
    for f in t1[e]:
        t[e][f] = 1.0 / ne
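# A small sanity check on the uniform initialization above (an illustrative addition,
# not part of the original script): for every English word e that co-occurs with at
# least one foreign word f, the probabilities t[e][f] should sum to 1.
for e in t:
    if t1[e]:
        assert abs(sum(t[e].values()) - 1.0) < 1e-6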
def main():
    directory = '/Users/jtim/Dropbox/Academic/research/dissertation/research/output/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    by_author = {}
    for author, files in corpora.items():
        by_author[author] = read_files_into_string(files)

    authors = ("Bahá'u'llah", "Bahá'u'llah Baghdad", "`Abdu'l-Bahá", "the Báb",
               "al-Shaykh Murtaḍá al-Ánsárí")

    # Transform the authors' corpora into lists of word tokens
    by_author_tokens = {}
    by_author_length_distributions = {}
    for author in by_author:
        tokens = by_author[author].split()

        # Filter out punctuation
        by_author_tokens[author] = ([process(token) for token in tokens
                                     if any(c.isalpha() for c in token)])

        # Get a distribution of token lengths
        token_lengths = [len(token) for token in by_author_tokens[author]]
        plt.ion()
        by_author_length_distributions[author] = nltk.FreqDist(token_lengths)
        by_author_length_distributions[author].plot(15, title=author, color='grey')
        plt.savefig('{}figures/word-length-{}.png'.format(directory, author))
        plt.ioff()
        plt.close("all")

    ### Chi-squared tests ###

    # Test 1: Compare the distance of Bahá'u'lláh's full corpus to His Baghdad writings
    # and the distance of Shaykh Murtaḍá's writings to Bahá'u'lláh's Baghdad writings
    authors_one = ("Bahá'u'llah", "al-Shaykh Murtaḍá al-Ánsárí")
    for author in authors_one:
        # First, build a joint corpus and identify the 500 most frequent words in it
        joint_corpus = (by_author_tokens[author] +
                        by_author_tokens["Bahá'u'llah Baghdad"])
        joint_freq_dist = nltk.FreqDist(joint_corpus)
        most_common = list(joint_freq_dist.most_common(500))

        # What proportion of the joint corpus is made up of the candidate
        # author's (Bahá'u'llah and Shaykh Murtaḍá) tokens?
        author_share = (len(by_author_tokens[author]) / len(joint_corpus))

        # Now, let's look at the 500 most common words in the candidate
        # author's (Bahá'u'llah and Shaykh Murtaḍá) corpus and compare
        # the number of times they can be observed to what would be expected
        # if the author's writings and Bahá'u'llah's Baghdad writings were
        # both random samples from the same distribution.
        chisquared = 0
        for word, joint_count in most_common:
            # How often do we really see this common word?
            author_count = by_author_tokens[author].count(word)
            baghdad_count = by_author_tokens["Bahá'u'llah Baghdad"].count(word)

            # How often should we see it?
            expected_author_count = joint_count * author_share
            expected_joint_count = joint_count * (1 - author_share)

            # Add the word's contribution to the chi-squared statistic
            chisquared += ((author_count - expected_author_count) *
                           (author_count - expected_author_count) /
                           expected_author_count)
            chisquared += ((baghdad_count - expected_joint_count) *
                           (baghdad_count - expected_joint_count) /
                           expected_joint_count)

        print("The Chi-squared statistic for", author,
              "compared to Bahá'u'llah's Baghdad writings is", chisquared)

    # Test 2: Now consider the relative distance between the writings of `Abdu'l-Bahá
    # and the writings of Bahá'u'lláh compared to the writings of Shaykh Murtaḍá.
    authors_two = ("`Abdu'l-Bahá", "al-Shaykh Murtaḍá al-Ánsárí")
    for author in authors_two:
        # First, build a joint corpus and identify the 500 most frequent words in it
        joint_corpus = (by_author_tokens[author] +
                        by_author_tokens["Bahá'u'llah"])
        joint_freq_dist = nltk.FreqDist(joint_corpus)
        most_common = list(joint_freq_dist.most_common(500))

        # What proportion of the joint corpus is made up of the candidate
        # author's (`Abdu'l-Bahá and Shaykh Murtaḍá) tokens?
        author_share = (len(by_author_tokens[author]) / len(joint_corpus))

        # Now, let's look at the 500 most common words in the candidate
        # author's (`Abdu'l-Bahá and Shaykh Murtaḍá) corpus and compare
        # the number of times they can be observed to what would be expected
        # if the author's writings and Bahá'u'llah's writings were both
        # random samples from the same distribution.
        chisquared = 0
        for word, joint_count in most_common:
            # How often do we really see this common word?
            author_count = by_author_tokens[author].count(word)
            baghdad_count = by_author_tokens["Bahá'u'llah"].count(word)

            # How often should we see it?
            expected_author_count = joint_count * author_share
            expected_joint_count = joint_count * (1 - author_share)

            # Add the word's contribution to the chi-squared statistic
            chisquared += ((author_count - expected_author_count) *
                           (author_count - expected_author_count) /
                           expected_author_count)
            chisquared += ((baghdad_count - expected_joint_count) *
                           (baghdad_count - expected_joint_count) /
                           expected_joint_count)

        print("The Chi-squared statistic for", author,
              "compared to Bahá'u'llah's writings is", chisquared)
def read_files_into_string(filenames):
    strings = []
    for filename in filenames:
        with open(filename) as f:
            strings.append(process(f.read()))
    return '\n'.join(strings)
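# A minimal usage sketch for read_files_into_string(), assuming process() is the text
# normalizer imported from helper.py; the file paths below are placeholders, since the
# analysis scripts normally pass in the file lists from a corpora dict.
sample_text = read_files_into_string(['texts/work-1.txt', 'texts/work-2.txt'])
print(sample_text[:200])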
def main():
    directory = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/output/islamicate-texts/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Islamicate variables
    arabic_islamicate_counter = Counter()
    persian_islamicate_counter = Counter()
    arabic_islamicate_vocabulary = set()
    persian_islamicate_vocabulary = set()
    arabic_islamicate_word_count = 0
    persian_islamicate_word_count = 0
    arabic_islamicate_vocabularies = []
    persian_islamicate_vocabularies = []

    # Baha'i variables
    arabic_bahai_counter = Counter()
    persian_bahai_counter = Counter()
    arabic_bahai_vocabulary = set()
    persian_bahai_vocabulary = set()
    arabic_bahai_word_count = 0
    persian_bahai_word_count = 0
    arabic_bahai_vocabularies = []
    persian_bahai_vocabularies = []

    # Process Islamicate files
    for a_file in arabic_islamicate_files:
        with open(a_file, 'r') as af:
            words = Counter(process(af.read()).split())
            arabic_islamicate_word_count += len(words)
            arabic_islamicate_vocabulary.update(words)
            arabic_islamicate_counter.update(words)
    for p_file in persian_islamicate_files:
        with open(p_file, 'r') as pf:
            words = Counter(process(pf.read()).split())
            persian_islamicate_word_count += len(words)
            persian_islamicate_vocabulary.update(words)
            persian_islamicate_counter.update(words)

    # Process Baha'i files
    for a_file in arabic_bahai_files:
        with open(a_file, 'r') as af:
            words = Counter(process(af.read()).split())
            arabic_bahai_word_count += len(words)
            arabic_bahai_vocabulary.update(words)
            arabic_bahai_counter.update(words)
    for p_file in persian_bahai_files:
        with open(p_file, 'r') as pf:
            words = Counter(process(pf.read()).split())
            persian_bahai_word_count += len(words)
            persian_bahai_vocabulary.update(words)
            persian_bahai_counter.update(words)

    # Common variables
    minimum_threshold = 5
    for i in range(1, minimum_threshold):
        # remove words occurring fewer than minimum_threshold times
        for item in arabic_islamicate_counter.items():
            if item[1] <= i:
                if item[0] in arabic_islamicate_vocabulary:
                    arabic_islamicate_vocabulary.remove(item[0])
        for item in persian_islamicate_counter.items():
            if item[1] <= i:
                if item[0] in persian_islamicate_vocabulary:
                    persian_islamicate_vocabulary.remove(item[0])
        for item in arabic_bahai_counter.items():
            if item[1] <= i:
                if item[0] in arabic_bahai_vocabulary:
                    arabic_bahai_vocabulary.remove(item[0])
        for item in persian_bahai_counter.items():
            if item[1] <= i:
                if item[0] in persian_bahai_vocabulary:
                    persian_bahai_vocabulary.remove(item[0])

    combined_bahai_counter = arabic_bahai_counter + persian_bahai_counter
    bahai_intersection = persian_bahai_vocabulary.intersection(
        arabic_bahai_vocabulary)
    islamicate_intersection = persian_islamicate_vocabulary.intersection(
        arabic_islamicate_vocabulary)

    # Print Islamicate stats
    print("Islamicate language statistics:")
    print("Arabic word count: {}".format(arabic_islamicate_word_count))
    print("Persian word count: {}".format(persian_islamicate_word_count))
    print("Arabic vocabulary: {}".format(len(arabic_islamicate_vocabulary)))
    print("Persian vocabulary: {}".format(len(persian_islamicate_vocabulary)))
    print("intersection: {}".format(len(islamicate_intersection)))
    print('\n')

    with open('{}islamicate_intersection.txt'.format(directory), 'w') as out_file:
        for w in islamicate_intersection:
            out_file.write(w + '\n')
    with open('{}islamicate_intersection_sample.txt'.format(directory), 'w') as out_file:
        sample = random.sample(list(islamicate_intersection),
                               round((len(islamicate_intersection) / 100)) * 2)
        for w in sample:
            out_file.write(w + '\n')

    # Print Baha'i stats
    print("Bahá'í language statistics:")
    print("Arabic word count: {}".format(arabic_bahai_word_count))
    print("Persian word count: {}".format(persian_bahai_word_count))
    print("Arabic vocabulary: {}".format(len(arabic_bahai_vocabulary)))
    print("Persian vocabulary: {}".format(len(persian_bahai_vocabulary)))
    print("intersection: {}".format(len(bahai_intersection)))
    print('\n')

    with open('{}bahai_intersection.txt'.format(directory), 'w') as out_file:
        for w in bahai_intersection:
            out_file.write(w + '\n')
    with open('{}bahai_intersection_sample.txt'.format(directory), 'w') as out_file:
        sample = random.sample(list(bahai_intersection),
                               round((len(bahai_intersection) / 100)) * 2)
        for w in sample:
            out_file.write(w + '\n')

    # Plot table
    data = [[4242, 15, 1927, 2], [30000, 2000, 20000, 22000]]
    columns = ("`Abdu'l-Bahá", "Báb", "Bahá'u'lláh", "Shoghi Effendi")
    rows = ["Sample", "Known works"]
    # values = np.arange(0, 2500, 500)
    # value_increment = 1000

    colors = plt.cm.BuPu(np.linspace(0, 0.5, len(rows)))
    n_rows = len(data)
    index = np.arange(len(columns)) + 0.3
    bar_width = 0.4
    y_offset = np.zeros(len(columns))   # start each stacked bar at zero

    for row in range(n_rows):
        plt.bar(index, data[row], bar_width, bottom=y_offset, color=colors[row])
        y_offset = y_offset + data[row]

    plot = plt.table(cellText=data,
                     cellColours=None,
                     cellLoc='right',
                     colWidths=None,
                     rowLabels=rows,
                     rowColours=None,
                     rowLoc='left',
                     colLabels=columns,
                     colColours=None,
                     colLoc='center',
                     loc='bottom',
                     bbox=None,
                     edges='closed')
    plt.show()
arabic_counter = Counter()
mmha1 = []
for name in corpus:
    if 'mmha1' in name:
        mmha1.append(name)

for file in mmha1:
    # append so the output for every work is kept, not just the last one
    with open("{}{}".format(dir, file), 'r') as f, \
            open('/Users/jtim/Desktop/out.txt', 'a') as out:
        out.write(file)
        out.write('\n')
        out.write("--------------------------------------")
        out.write('\n')
        out.write(f.read())

for file in mmha1:
    with open("{}{}".format(dir, file), 'r') as f, \
            open('/Users/jtim/Desktop/out.txt', 'a') as out_c:
        # read once; a second f.read() would return an empty string
        text = f.read()
        words = Counter(process(text).split())
        arabic_counter.update(words)
        out_c.write(text)
        out_c.write('\n')
        out_c.write("--------------------------------------")
        out_c.write('\n')
        out_c.write(str(words))

# print(len(mmha1))
# print(len(arabic_counter))
# print(arabic_counter.most_common(10000))
            L = loads(L)[data][list]
        else:
            L = loads(L)[data]
        # f is a function used to generate the data; content extracts the data from the
        # news response, and desc extracts the news item's description
        self.list = map(f, L)    # use map to enumerate
        self.content = content   # save state
        self.desc = desc         # save state

    def process(self):
        # process has been changed to fetch the news; the process function in helper.py
        # then converts it into a format that bikeso can accept
        for url in self.list:
            resultPage = fetch(url)
            result = loads(resultPage)[self.content]
            yield (result['title'], helper.process(result[self.desc]))


# for test only
if __name__ == '__main__':
    url = 'http://www.imxingzhe.com/api/v4/get_competitions?page=0&limit=500'

    def f(id):
        return 'http://www.imxingzhe.com/api/v4/competition_detail?competition_id=' + str(id['id'])

    tmp = AdvanceWeb(url, 'data', f, 'data', 'description')
    result = tmp.process()
    for i in result:
        print(process(BeautifulSoup(i, 'html.parser')))
    data = json.loads(msg, strict=False)
    print(data)
    print('Incoming Email Loaded')
except Exception as e:
    print('Error while loading state....Exit-1 : ' + str(e))
    sys.exit(1)

# Fix string data type
text_body = data['text_body'].encode('ascii', errors='ignore').decode()
subject = data['subject'].encode('ascii', errors='ignore').decode()
frm = data['from']['name']

# Get email data structure and intent
emails = fx.process(text_body, frm, subject)
intent = fx.getIntent(emails)

# Prepare the doc
doc = {}
doc['emails'] = emails
doc['intent'] = intent
if intent.get('intent'):
    doc['request'] = req[intent['intent']]

# Update the collection
result = collection.update_one({'caseid': data['caseid']}, {"$set": doc}, upsert=False)

# Send message to the reply queue
url = "https://sd-ui.azurewebsites.net/task/" + data['caseid']
import helper as app

if __name__ == '__main__':
    app.init()
    app.process()
    app.end()
def file_to_string(filenames):
    strings = []
    for filename in filenames:
        with open(filename) as f:
            strings.append(process(process_kitab(f.read())))
    return '\n'.join(strings)
parser.add_argument('--test_path', default='data/eval.iob')
parser.add_argument('--patience', type=int, default=10)
parser.add_argument('--number_normalized', type=bool, default=True)
parser.add_argument('--use_crf', type=bool, default=True)
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_crf:', args.use_crf)

if not os.path.exists(args.savedir):
    os.makedirs(args.savedir)

test_path = '/content/test.iob'
text = input("Enter the sentence: ")
print(text)
process(text, test_path)

eval_path = "evaluation"
eval_temp = os.path.join(eval_path, "temp")
if not os.path.exists(eval_temp):
    os.makedirs(eval_temp)
test_pred_file = eval_temp + '/test_pred.txt'

# Loading the vocabulary
model_name = args.savedir + '/' + args.feature_extractor + str(args.use_char) + str(args.use_crf)
word_vocab = WordVocabulary(args.train_path, args.dev_path, args.test_path,
                            args.number_normalized)
label_vocab = LabelVocabulary(args.train_path)
alphabet = Alphabet(args.train_path, args.dev_path, args.test_path)