def main():
    if 0:
        print_unknown_modules()
        print_unknown_functions()
    # Dump reference counts and identifying info for every live Map object.
    for v in get_objects():
        if type(v) is Map:
            if '__module__' in v:
                line('%d: Map.__module__: %s', reference_count(v), v['__module__'])
                continue
            if '__doc__' in v:
                line('%d: Map.__doc__: %r', reference_count(v), v['__doc__'])
                continue
            line('%d: Map.keys: %s', reference_count(v), v.keys())


def find_object_by_address(address):
    # Walk every live object and return the one located at the given address.
    for v in get_objects():
        if address_of(v) == address:
            return v


from Tokenizer import tokenizer

tokenizer()
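# Hedged usage sketch (not part of the original code): an address reported by
# main() above could be resolved back to its live object with
# find_object_by_address(). The address value below is purely illustrative.
addr = 0x7f0000000000  # illustrative address, not a real one
obj = find_object_by_address(addr)
if obj is not None:
    print(type(obj), reference_count(obj))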
import json

from Tokenizer import tokenizer


def preprocess():
    # Tokenize the parallel UM-Corpus files (each English line is followed by
    # its Chinese translation) and dump the token lists as JSON.
    blog_en, blog_cn = [], []
    t = tokenizer()
    with open('UM_Corpus/Bi-Microblog.txt', 'r') as f:
        lines = f.readlines()
    for i in range(0, len(lines), 2):
        en = t.tokenize(lines[i].strip())
        cn = t.tokenize(lines[i + 1].strip())
        if en is None or cn is None:
            continue
        blog_en.append(en)
        blog_cn.append(cn)

    news_en, news_cn = [], []
    with open('UM_Corpus/Bi-News.txt', 'r') as f:
        lines = f.readlines()
    for i in range(0, len(lines), 2):
        en = t.tokenize(lines[i].strip())
        cn = t.tokenize(lines[i + 1].strip())
        if en is None or cn is None:
            continue
        news_en.append(en)
        news_cn.append(cn)

    with open('UM_Corpus/blog_en', 'w') as f1, \
            open('UM_Corpus/blog_cn', 'w') as f2, \
            open('UM_Corpus/news_en', 'w') as f3, \
            open('UM_Corpus/news_cn', 'w') as f4:
        json.dump(blog_en, f1)
        json.dump(blog_cn, f2)
        json.dump(news_en, f3)
        json.dump(news_cn, f4)
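# A minimal follow-up sketch (not part of the original code): one way the JSON
# files written by preprocess() above could be read back into lists of tokenized
# sentences. The helper name load_um_corpus is hypothetical; the paths mirror
# the dump calls in preprocess().
def load_um_corpus(prefix='UM_Corpus'):
    corpora = {}
    for name in ('blog_en', 'blog_cn', 'news_en', 'news_cn'):
        with open('{}/{}'.format(prefix, name)) as f:
            corpora[name] = json.load(f)
    return corpora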
def embed(self, text1, text2):
    p = tokenizer()
    text1 = [p.tokenize(t, cn=False) for t in text1]
    text2 = [p.tokenize(t, cn=False) for t in text2]
    feats1, feats2 = encode_sentences(self.model, (text1, text2), test=True)
    return feats1, feats2
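# Hedged follow-up sketch (an assumption, not the original code): once embed()
# returns the two feature matrices, aligned sentence pairs could be scored with
# row-wise cosine similarity. numpy and the helper name pairwise_cosine are
# assumptions introduced here.
import numpy as np


def pairwise_cosine(feats1, feats2):
    # Normalize each row to unit length, then take the row-wise dot product.
    a = feats1 / np.linalg.norm(feats1, axis=1, keepdims=True)
    b = feats2 / np.linalg.norm(feats2, axis=1, keepdims=True)
    return (a * b).sum(axis=1)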
def calculator():
    while True:
        expr = readInput()
        tokens = tokenizer(expr)

        # Collect only the parentheses so they can be balance-checked.
        parens = []
        for t in tokens:
            if t == '(' or t == ')':
                parens.append(t)
        if not parenChecker(''.join(parens)):
            print("Parentheses mismatch. Please try again..")
            continue

        if tokens is not None and len(tokens) >= 1:
            postfix = infixToPostfix(tokens)
            print("postfix: ", postfix)
            result = postfixEvaluator(postfix)
            print("ANS: ", result)

        print("press any key to continue, 'n' to exit.")
        ch = input()
        if ch == 'n':
            break
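# Minimal sketch of the parenChecker() helper the calculator above relies on.
# This is an assumption about its behaviour, not the original implementation:
# it takes a string containing only '(' and ')' and reports whether they balance.
def parenChecker(parens):
    depth = 0
    for ch in parens:
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth < 0:  # a closing parenthesis appeared before its opening
                return False
    return depth == 0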
def main():
    # Attributes
    filename = 'Dataset.csv'
    # Reading the dataset
    intents, uniqueIntents, sentences = readDataset(filename)
    # Cleaning the sentences and tokenizing
    cleanedWords = cleaningSentences(sentences)
    # Indexing
    wordTokenizer = tokenizer(cleanedWords)
    # Maximum sentence length (in words)
    length = maxLength(cleanedWords)
    # Prediction
    text = 'What to do if my business category is not in the options?'
    pred = predictions(text, wordTokenizer, length)
    # Getting final output
    getSetOfIntents(pred, uniqueIntents)
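# Hedged sketch of the tokenizer() and maxLength() helpers assumed by main()
# above (and by the training script further down). The Keras-style usage
# (word_index, a filters argument) suggests a thin wrapper around the Keras
# Tokenizer, but this is an assumption, not the original implementation, and
# the import path may differ per project.
from tensorflow.keras.preprocessing.text import Tokenizer


def tokenizer(words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    # Fit a Keras Tokenizer on the already-cleaned, tokenized sentences.
    t = Tokenizer(filters=filters)
    t.fit_on_texts(words)
    return t


def maxLength(words):
    # Length of the longest tokenized sentence; used later as the padding length.
    return max(len(sentence) for sentence in words)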
def create_index(self):
    # Required module-level imports (assumed): json, math, pandas as pd,
    # BeautifulSoup and Comment from bs4, and the project's tokenizer().
    with open(self.corpus_html, encoding='utf-8') as f:
        corpus_data = json.load(f)  # load the data
    print('Start to parse...')
    num = 1
    # Keys look like "0/19": the url lives in folder 0, file 19 inside that folder.
    for (doc_id, url) in corpus_data.items():
        id_info = doc_id.split('/')
        folder_id = id_info[0]
        file_id = id_info[1]
        file_name = "{}/{}/{}".format("WEBPAGES_RAW", folder_id, file_id)
        with open(file_name, 'r', encoding='utf-8') as html:
            soup = BeautifulSoup(html, 'lxml')
        text_info = soup.findAll(text=True)
        for text in text_info:
            if (text.parent.name not in ['style', 'script', '[document]', 'meta']) \
                    and (not isinstance(text, Comment)):
                self.token_tf_dict = tokenizer(text.strip())
                token_dict_items = self.token_tf_dict.items()
                # Notice that self.inverted_index_tf is a dict of dicts:
                # term -> {doc_id: term frequency}.
                if text.parent.name in ["head", "title", "bold"]:
                    # Terms inside more "important" tags get double term frequency.
                    for (token, frequency) in token_dict_items:
                        self.inverted_index_tf[token][doc_id] = frequency * 2
                else:
                    for (token, frequency) in token_dict_items:
                        self.inverted_index_tf[token][doc_id] = frequency
        print("Starting...." + str(num))
        num += 1

    # Turn raw term frequencies into [weighted tf, tf-idf] pairs and accumulate
    # each document's squared vector length for cosine normalization.
    for term in self.inverted_index_tf.keys():
        df = len(self.inverted_index_tf[term])
        idf = math.log10(self.total_num_of_doc / df)
        for docid in self.inverted_index_tf[term].keys():
            weighted_tf = float(1 + math.log10(self.inverted_index_tf[term][docid]))
            tf_idf_score = weighted_tf * idf
            self.inverted_index[term][docid] = [weighted_tf, tf_idf_score]
            self.document_length[docid] += math.pow(self.inverted_index[term][docid][0], 2)

    for doc in self.document_length.keys():
        self.document_length[doc] = math.sqrt(self.document_length[doc])

    # write inverted_index dict into pandas pickle file
    index_storage = pd.Series(self.inverted_index)
    index_storage.to_pickle("inverted_index__final_file.pkl")
    # write document_length dict into pandas pickle file
    document_storage = pd.Series(self.document_length)
    document_storage.to_pickle("document_length__final_file.pkl")
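# Hedged sketch of the tokenizer() helper create_index() above depends on: the
# calling code treats its return value as a {token: term_frequency} mapping, so
# a plausible (assumed, not original) implementation is a simple alphanumeric
# tokenizer with counting.
import re
from collections import Counter


def tokenizer(text):
    # Lowercase, split on non-alphanumeric characters, and count occurrences.
    return Counter(re.findall(r'[a-z0-9]+', text.lower()))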
import math
from collections import defaultdict


def query_process(inverted_index, document_length, query):
    # tokenizer() from the indexing module is assumed to be in scope.
    tokenized_query = tokenizer(query)

    if len(tokenized_query) == 1:
        # Single-term query: rank documents by their stored
        # [weighted tf, tf-idf] entries.
        query_dict = dict()
        try:
            token = list(tokenized_query.keys())[0]
            query_dict = inverted_index[token]
        except:
            # Query term missing from the index; return whatever was found.
            pass
        query_result = list(query_dict.items())
        # Only sort if more than 1 url returned
        if len(query_result) > 1:
            query_result = sorted(query_result, key=lambda x: x[1], reverse=True)
        return query_result
    else:
        # Multi-term query: cosine similarity between the normalized query
        # vector and each document's normalized tf vector.
        multi_query_dict = defaultdict(float)
        try:
            query_normalized_tfidf_dict = defaultdict(float)
            query_length_square = 0
            for token in tokenized_query.keys():
                tf_weight = 1 + math.log10(tokenized_query[token])
                idf = math.log10(len(document_length) / len(inverted_index[token]))
                tf_idf = tf_weight * idf
                query_normalized_tfidf_dict[token] = tf_idf
                query_length_square += math.pow(tf_idf, 2)
            query_length = math.sqrt(query_length_square)
            for token in query_normalized_tfidf_dict.keys():
                query_normalized_tfidf_dict[token] = query_normalized_tfidf_dict[token] / query_length
            for token in query_normalized_tfidf_dict.keys():
                doc_dict = inverted_index[token]
                for doc in doc_dict.keys():
                    normalized_tf_in_doc = inverted_index[token][doc][0] / document_length[doc]
                    multi_query_dict[doc] += query_normalized_tfidf_dict[token] * normalized_tf_in_doc
        except:
            # A query term missing from the index aborts scoring; keep partial results.
            pass
        multi_query_result = list(multi_query_dict.items())
        # Only sort if more than 1 url returned
        if len(multi_query_result) > 1:
            multi_query_result = sorted(multi_query_result, key=lambda x: x[1], reverse=True)
        return multi_query_result
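# Hedged usage sketch (assumption, not original code): load the pickle files
# produced by create_index() and print the top-ranked documents for a free-text
# query. The query string is purely illustrative.
import pandas as pd

if __name__ == '__main__':
    inverted_index = pd.read_pickle('inverted_index__final_file.pkl')
    document_length = pd.read_pickle('document_length__final_file.pkl')
    results = query_process(inverted_index, document_length, 'machine learning')
    for doc_id, score in results[:20]:
        print(doc_id, score)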
# Assumed imports for the training script below; adjust the Keras import path
# to the project's setup.
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint


def main():
    # Attributes
    filename = 'Dataset.csv'
    # Reading the dataset
    intents, uniqueIntents, sentences = readDataset(filename)
    # Cleaning the sentences and tokenizing
    cleanedWords = cleaningSentences(sentences)
    # Indexing
    wordTokenizer = tokenizer(cleanedWords)
    vocabSize = len(wordTokenizer.word_index) + 1
    length = maxLength(cleanedWords)
    # Encoding the sentences
    encodedDoc = encodingDoc(wordTokenizer, cleanedWords)
    # Making equal length
    paddedDoc = paddindDoc(encodedDoc, length)

    # For intents: tokenizer with a changed filter so intent labels stay intact
    outputTokenizer = tokenizer(uniqueIntents,
                                filters='!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
    # Encoding the intents with the unique intents
    encodedOutput = encodingDoc(outputTokenizer, intents)
    # Creating an array for each intent
    encodedOutput = np.array(encodedOutput).reshape(len(encodedOutput), 1)
    # One-hot encoding (creates a 2D array with columns = unique intents, rows = intents)
    outputOneHot = oneHotEncoder(encodedOutput)
    # Dataset preparation is finished.

    # Splitting the dataset
    trainX, valX, trainY, valY = train_test_split(paddedDoc, outputOneHot,
                                                  shuffle=True, test_size=0.2)

    # Model creation
    model = createModel(vocabSize, length)
    # Checking model (layers)
    model.compile(loss="categorical_crossentropy", optimizer="adam",
                  metrics=["accuracy"])
    model.summary()

    # Start model training
    filename = 'model.h5'
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    hist = model.fit(trainX, trainY, epochs=200, batch_size=16,
                     validation_data=(valX, valY), callbacks=[checkpoint])
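# Hedged sketch of the encodingDoc() and paddindDoc() helpers used in main()
# above. These are assumptions based on the standard Keras workflow, not the
# original implementations, and the import path may differ per project.
from tensorflow.keras.preprocessing.sequence import pad_sequences


def encodingDoc(fittedTokenizer, words):
    # Map each tokenized sentence to its sequence of word indices.
    return fittedTokenizer.texts_to_sequences(words)


def paddindDoc(encodedDoc, length):
    # Pad every sequence to the same length so it can feed the embedding layer.
    return pad_sequences(encodedDoc, maxlen=length, padding='post')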
import json

from Tokenizer import tokenizer

p = tokenizer()

# # with open('google_results.txt', 'w') as f:
# #     json.dump(google, f)

# with open('baidu_results.txt') as f:
#     baidu = json.load(f)
#
# for id in baidu:
#     for s in baidu[id]:
#         s['title'] = p.tokenize(s['title'])
#
# with open('baidu_results.txt', 'w') as f:
#     json.dump(baidu, f)


def num():
    with open('baidu_results.txt') as f:
        baidu = json.load(f)
    with open('google_results.txt') as f:
        google = json.load(f)
    with open('dataset.txt') as f:
        twitter = json.load(f)
    events = [