def banner():
    while True:
        print "1. Press 1 for running the tokenizer usually"
        print "2. Press 2 for creating the inverted index"
        print "3. Press 3 for creating the vectors for the documents"
        print "4. Press any other number to search"
        choice = int(raw_input("$ "))
        if not os.path.exists(TEXT):
            print "No Data at All. No Valid Corpus. Please add something to\n" + str(TEXT)
        if not os.path.exists(DATA_PATH):
            print "No Data Existed, Path Created"
            os.mkdir(DATA_PATH)  # create the path that was checked above (original called os.mkdir(DATA))
        if not os.path.exists(TOKENS) or choice == 1:
            print "Creating tokens as they don't exist/You Chose To"
            os.mkdir(TOKENS)
            tokenizer()
        if not os.path.exists(INDICES) or choice == 2:
            print "Creating indices as they don't exist"
            os.mkdir(INDICES)
            counter()
        if not os.path.exists(SCORES) or choice == 3:
            print "Creating Vectors as they don't exist"
            os.mkdir(SCORES)
            score()
            mod()
        if choice > 3 or choice == 0:
            print "Search begins"
            break
def main():
    # Take the input file path from the command line (exactly one argument expected)
    args = sys.argv[1:]
    if len(args) == 1:
        file_in = args[0]
    else:
        print("Invalid arguments")
        exit(-1)

    stopwords_fp = "../text/stopwords.txt"
    file_out = 'tokenized.txt'

    # Read text from the input file and the stopwords file into strings
    input_file = open(file_in, 'r')
    stopwords_file = open(stopwords_fp, 'r')
    text = ''
    stopwords = ''
    for line in input_file:
        text = text + line
    for line in stopwords_file:
        stopwords = stopwords + line

    # Tokenize string
    tokenized_text = tokenizer.tokenizer(text, stopwords)

    # Output tokenized text to file_out
    output_file = open(file_out, 'w')
    for line in tokenized_text:
        output_file.write(line)
def main():
    user_input = input("Please, type your Lisp-like function:")
    tokens = tokenizer(user_input)
    abs_syntax_tree = parser(tokens)
    new_ast = transformer(abs_syntax_tree)
    output = codeGenerator(new_ast)
    return output
def tokenStream(root, maxPages):
    urlDex = {}
    docTotal = 0
    for page in selfCrawler.crawl(root, maxPages):
        docTotal += 1
        raw = page.raw
        ID = page.ID
        url = page.url
        urlDex[ID] = url
        txtIDPair = {}
        print(ID)
        if raw is not None:
            try:
                txtIDPair["TEXT"] = extractor.ConcordiaPageExtract(raw)
            except Exception as e:
                print(e, len(raw))
                txtIDPair["TEXT"] = None
                print(ID, " : ", url)
        else:
            txtIDPair["TEXT"] = None
        # docLengths is expected to be a module-level dict mapping doc ID -> text length
        if txtIDPair["TEXT"] is None:
            docLengths[ID] = 0
        else:
            docLengths[ID] = len(txtIDPair['TEXT'])
        txtIDPair["ID"] = ID
        if txtIDPair["TEXT"] is not None:
            docLengths[txtIDPair["ID"]] = len(txtIDPair["TEXT"])
        for token in tokenizer.tokenizer(txtIDPair):
            token = (token[0], token[1].lower())
            yield token
    # once the crawl is exhausted, persist the ID -> URL mapping
    with open("urls.json", 'w') as url_map:
        x = json.dumps(urlDex, sort_keys=True, indent=2)
        url_map.write(x)
def __init__(self, _config, _value):
    super().__init__(_config, _value)
    self.tokens = tokenizer(_value)
    self.value = (' '.join(self.tokens)).lower()
    self.n_words = len(self.tokens)
    self.n_graphemes = len(self.value)
    self.romanized = unidecode(self.value)
def main():
    # Take the input file path from the command line (exactly one argument expected)
    args = sys.argv[1:]
    if len(args) == 1:
        file_in = args[0]
    else:
        print("Invalid arguments")
        exit(-1)

    stopwords_fp = "../text/stopwords.txt"
    file_out = 'terms.txt'

    # Read text from the input file and the stopwords file into strings
    input_file = open(file_in, 'r')
    stopwords_file = open(stopwords_fp, 'r')
    text = ''
    stopwords = ''
    for line in input_file:
        text = text + line
    for line in stopwords_file:
        stopwords = stopwords + line

    # Tokenize string
    tokenized_text = tokenizer.tokenizer(text, stopwords)

    # Generate and write list of top 200 most frequent terms
    top_200(tokenized_text, file_out)
    vocabulary_growth(tokenized_text)
def test_tokenizer(self):
    e1 = "to sentence here 12:30 3h 15min tags:tag1,tag3 , tag4"
    e2 = "plain todo"
    target = [
        ("WORD", "to"),
        ("WORD", "sentence"),
        ("WORD", "here"),
        ("INTEGER", 12),
        ("TAG_MARKER", ":"),
        ("INTEGER", 30),
        ("INTEGER", 3),
        ("TIME_UNIT", "h"),
        ("INTEGER", 15),
        ("TIME_UNIT", "min"),
        ("WORD", "tags"),
        ("TAG_MARKER", ":"),
        ("WORD", "tag1"),
        ("WORD", "tag3"),
        ("WORD", "tag4"),
    ]
    tok = tokenizer(e1)
    self.assertEqual(tok.get_token_list(), target)
    self.assertEqual(tok.tag_marker_count, 2)
def runSingleFile(self, isWrite=True):
    tokens = tokenizer(self.content)
    engine = CompilationEngine(tokens)
    pdb.set_trace()  # debugging breakpoint left in; remove when not debugging
    if isWrite:
        # asmCmds is expected to hold the commands produced by the engine
        with open(self.outputPath, 'w') as f:
            f.write(''.join(asmCmds))
def single_test(jack_path):
    with open(jack_path, 'r') as f:
        content = f.read()
    tokens = tokenizer(content)
    ce = CompilationEngine(tokens)
    exps = ce.expressions
    print(len(exps))
    print(exps[0].show())
def inverse_index(data):
    d_map = defaultdict(list)
    for idx, val in enumerate(data):
        for word in tokenizer(val):
            d_map[word].append(idx)
    return d_map
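# Hedged usage sketch (added for illustration, not part of the original source):
# assuming `tokenizer` splits text into lowercase word tokens, inverse_index maps
# each word to the list of document indices in which it occurs, e.g.
#
#   docs = ["the cat sat", "the dog barked at the cat"]
#   inverse_index(docs)
#   # -> {'the': [0, 1, 1], 'cat': [0, 1], 'sat': [0], 'dog': [1], 'barked': [1], 'at': [1]}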
def __init__(self, inputPath):
    with open(inputPath, 'r') as f:
        content = f.read()
    self.tokens = tokenizer(content)
    self.curIdx = 0
    self.XMLArr = []
    self.compileClass()
    self.XML = ''.join(self.XMLArr)
def getSentiment(self, text):
    text_tokens = tokenizer(text)
    score = 0.0
    for token in text_tokens:
        if token in self.sent_dict:
            score += self.sent_dict[token]
    if len(text_tokens) == 0:
        return 0
    return score / (len(text_tokens) * 5)
def __init__(self):
    self.sent_dict = dict()
    sent_file = open(SENTIMENT_FILE)
    for line in sent_file:
        term, score = line.split('\t')
        tokens = tokenizer(term)
        if len(tokens) > 0:
            term_token = tokens[0]
            self.sent_dict[term_token] = float(score)
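# Hedged illustration (added, not part of the original source): the two sentiment
# snippets above appear to assume SENTIMENT_FILE is a tab-separated lexicon in the
# style of AFINN, one "term<TAB>score" pair per line with scores in [-5, 5]. That
# is why getSentiment divides the summed score by len(text_tokens) * 5: it
# normalizes the per-token average into roughly [-1, 1].
#
#   example lexicon lines:
#     abandon<TAB>-2
#     awesome<TAB>4
#
#   example use (the class name is an assumption):
#     analyzer = SentimentAnalyzer()
#     analyzer.getSentiment("this is awesome")   # -> 4 / (3 * 5) ~= 0.27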
def parse_str(self, raw_string):
    """parse the given string and return a new entry instance"""
    self.tok = tokenizer(raw_string)
    # parse the string and write the results into temp vars;
    # the read methods append their results to the corresponding vars
    while True:
        t = self.tok.reveal_next_token()
        if t is None:
            break
        if t[0] == "INTEGER":
            con = self._read_time_duration()
            if con:
                continue
        if t[0] != "INTEGER" and t[1].upper() in ["TAGS", "TAG", "T"]:
            con = self._read_tags()
            if con:
                continue
            else:
                self.tm_count += 1
        # read msg
        self.msg.append(t[1])
        self.tok.consume_next_token()
    # build msg
    new_entry = entry()
    for word in self.msg:
        new_entry.msg += str(word) + " "
    new_entry.msg = new_entry.msg.strip()
    # build timedelta objects and sum them
    complete_duration = datetime.timedelta()
    for d in self.durations:
        if d[1] in HOUR_UNITS:
            cur_dur = datetime.timedelta(hours=d[0])
        elif d[1] in MIN_UNITS:
            cur_dur = datetime.timedelta(minutes=d[0])
        complete_duration += cur_dur
    new_entry.duration = complete_duration
    # build entry
    new_entry.tags = self.tags
    # reset the parser and return the new entry
    self.reset()
    return new_entry
def tokenize_file(filename: str):
    """ Tokenize words in a file """
    line = ft.read_file(filename)
    words = ts.tokenizer(line)
    df = pd.DataFrame(FreqDist(words).items(), columns=['token', 'freq'])
    df = df.dropna()
    df['doc'] = filename
    df['n_words'] = len(words)
    df['tf'] = df['freq'] / df['n_words']
    print(f"Tokenized - {filename}")
    return df
def __init__(self, name=None, eos='<eos>', sos='<sos>', unk='<unk>', pad='<pad>',
             tok_type='spacy', lower=True):
    self.tokenizer = tokenizer(tok_type)
    self.eos = eos
    self.sos = sos
    self.pad = pad
    self.unk = unk
    self.name = name
    self.lower = lower
    self.word2index = {self.sos: 0, self.eos: 1, self.pad: 2, self.unk: 3}
    self.word2count = {}
    self.index2word = {0: self.sos, 1: self.eos, 2: self.pad, 3: self.unk}
    self.n_words = 4  # counts the sos, eos, pad and unk special tokens
def parse_documents(self):
    print('Parsing the documents and creating Hashmap index ...')
    tokenizer_obj = tokenizer()
    for i in range(len(self.all_documents)):
        self.doc_ids[i + 1] = self.all_documents[i]
        tokens = tokenizer_obj.parse(self.all_documents[i])
        # Creating Index while parsing
        self.addto_hashmap_index(i + 1, tokens)
    print('Saving the term id and document id files...')
    # Files of term id and document id
    self.make_files_of_terms_and_documents()
def main():
    parser = make_parser(prog="Twitter Tokenizer")
    args = parser.parse_args()
    # TODO: add function checks and exceptions
    module = load_module(args.writer)
    writer = module.writer
    module = load_module(args.reader)
    reader = module.reader()
    for tweet in reader:
        result = tokenizer(tweet, args.tokens, args.verbose)
        writer(result)
def main(argv):
    if len(argv) >= 2:
        with open(argv[1], 'r') as f:
            grammar = read_bnf(f.read())
        print_bnf(grammar)
    if len(argv) >= 3:
        tks = list(tokenizer(argv[1]))
        for tk in tks:
            print tk
        print
        ast = parser(tks)
        print ast
def tokenizer_execute(language, page_html, link):
    obj = tokenizer.tokenizer(language)
    obj.generate_tokens()
    obj2 = tokenizer.semantic_tokenizer(obj.tokens)
    obj2.generate_tokens()
    obj3 = tokenizer.extractor(obj.tokens, obj2.semantic_tokens)
    if not page_html:
        returned_result = obj3.start_extract(link)
        return returned_result
    else:
        returned_result = obj3.start_extract_without_fetch(page_html)
        return returned_result
def handle_mentions(body, say):
    raw_message = body["event"]["text"]
    tokenized_message = tokenizer.tokenizer(raw_message)
    tokenized_message_types = [x.type for x in tokenized_message]
    # print(tokenized_message)
    if tokenized_message_types == ["USERNAME", "QR", "URL"]:
        target_url = tokenized_message[2].value
        say(blocks=[{
            "type": "image",
            "title": {
                "type": "plain_text",
                "text": f"QR Code of {target_url}"
            },
            "block_id": f"image-{target_url}",
            "image_url": QR_BASE_URL + target_url,
            "alt_text": "QR Code"
        }])
    elif tokenized_message_types == ["USERNAME", "OMIKUJI"]:
        with open("omikuji_result.json") as f:
            omikuji_result = json.load(f)["omikuji"]
        chose_result = random.choice(omikuji_result)
        say(blocks=[{
            "type": "image",
            "title": {
                "type": "plain_text",
                "text": chose_result["text"]
            },
            "block_id": "image",
            "image_url": chose_result["image"],
            "alt_text": "Image " + chose_result["text"]
        }])
    elif tokenized_message_types == ["USERNAME", "GAKUSEKI", "STUDENT_ID"]:
        students = member_list.get_members()
        student_id = tokenized_message[2].value
        the_student = students.get(student_id)
        if the_student is None:
            say("Not found.")
        else:
            s_email = the_student["email"]
            s_real_name = the_student["real_name"]
            say(blocks=[{
                "type": "section",
                "fields": [{
                    "type": "mrkdwn",
                    "text": f":e-mail:*Email:*\n{s_email}",
                }, {
                    "type": "mrkdwn",
                    "text": f":pencil:*Real Name:*\n{s_real_name}",
                }]
            }])
def process_file(fileName, mine_type):
    worker = tokenizer(fileName)
    while True:
        word = worker.nextWord()
        if word is None:
            break
        log('# ' + word)
        if mine_type == cool_mine:
            # mining mode: collect words that are not yet in the known dictionary
            if not knowndb.find(word):
                unknowndb.add(word)
        else:
            knowndb.add(word)
def calculator():
    while True:
        expr = get_input()
        tokens = tokenizer(expr)
        parentheses = []
        for tok in tokens:
            if tok == "(" or tok == ")":
                parentheses.append(tok)
        if not paren_check("".join(parentheses)):
            print("Unmatched parentheses")
            continue  # skip evaluation of an unbalanced expression
        if tokens is not None and len(tokens) > 0:
            postfix = infixtopostfix(tokens)
            evaluation = postfixevaluation(postfix)
        print("press CTRL + C to quit, else press any key to continue")
def search(search_query):
    """ Searches for normalized words in the search query in the tokens table
    and returns the doc names in descending order of sum(freq) of query words"""
    keywords = "','".join(ts.tokenizer(search_query))
    db_conn = db.create_db_conn("MSSQL")
    tblname = f"{db.DBNAME}.{db.SCHEMA}.tokens"
    sql = (
        f"SELECT doc, sum(freq) "
        f"FROM {tblname} "
        f"WHERE token IN ('{keywords}') "
        f"GROUP BY doc "
        f"ORDER BY 2 DESC"
    )
    print(sql)
    df = pd.read_sql_query(sql, db_conn)
    return df
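# Hedged illustration (added, not part of the original source): for a query such
# as "quick brown", search() above builds and runs SQL of the form
#
#   SELECT doc, sum(freq)
#   FROM <DBNAME>.<SCHEMA>.tokens
#   WHERE token IN ('quick','brown')
#   GROUP BY doc
#   ORDER BY 2 DESC
#
# i.e. documents are ranked by the summed frequency of the query tokens they
# contain. The tokens are interpolated directly into the SQL string (no parameter
# binding), so the query text is assumed to be trusted input.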
def main(argv):
    print('Sedna Parser')
    if len(argv) < 2:
        print('command line syntax: <input-source> ...')
        return
    print('---start---')
    for inp in argv[1:]:
        fd = open(inp, 'rb')
        data = fd.read().decode('utf-8')
        fd.close()
        ast = tokenizer.tokenizer(data)
        print(json.dumps(ast))
def matches(self, term):
    """Given a term, returns a list of postings for that term and its
    synonyms. We find synonyms for the given term, obtain their postings,
    and merge them with the \"original\" list of postings for the term
    itself."""
    # Obtain the postings list for this term.
    term_postings = \
        set(self.compound_index.postings_list(self.INDEX, term))
    # Convert the postings list into a dictionary,
    # where keys are doc IDs and values are term counts.
    posting_dict = {}
    for posting in term_postings:
        posting_dict[posting[0]] = posting[1]
    # Find synonyms of the term from our thesaurus.
    thesaurus = Thesaurus()
    unstemmed = self.stemmed_unstemmed_map(self.INDEX)[term]
    synonyms = thesaurus[unstemmed]
    for synonym in synonyms:
        # Get the postings for this synonym.
        stemmed_synonym = tokenizer(synonym)[0]
        postings = self.compound_index.postings_list(
            self.INDEX, stemmed_synonym)
        # Update the existing postings with each synonym's count.
        for posting in postings:
            doc_id = posting[0]
            count = posting[1]
            if doc_id in posting_dict:
                posting_dict[doc_id] += count
            else:
                posting_dict[doc_id] = count
    # Convert the posting dictionary back into a posting list.
    combined_postings = []
    for doc_id, count in posting_dict.iteritems():
        combined_postings.append([doc_id, count])
    return sorted(combined_postings)
def run(sq):
    url_list = [
        "https://isha.sadhguru.org/us/en/wisdom/article/what-to-eat-making-right-food-choices",
        "https://www.pythonforbeginners.com/basics/getting-user-input-from-the-keyboard",
        "https://medium.com/center-for-data-science/deepmind-fellow-profile-ksenia-saenko-e6d0f7574a59",
        "https://medium.com/center-for-data-science/deepmind-fellow-profile-yassine-kadiri-7bfe4a045050"
    ]
    data = inverse_index(hit_urls(url_list))
    # update main map with words from the html pages, with their occurrences
    MAIN_MAP.update(data)
    query = tokenizer(sq)
    root = Node()
    ignore = ['©', '—', '’', '“', '”', "''"]
    for word in MAIN_MAP:
        if word not in ignore:
            add(root, word)
    retval = {}
    # search the compressed trie using the find function
    for key in query:
        if find(root, key):
            retval.update({key: MAIN_MAP[key]})
    resulting_idx = ranking(retval)
    if not resulting_idx:
        print(f'\n No results for your search query - {sq}')
        print('\n Modify the query and try again, listed below are the searched URLs')
        for idx, ul in enumerate(url_list):
            print(f'{idx+1}.{ul}')
        return
    print("\n Search results, in decreasing order of relevance \n")
    for idx, val in enumerate(resulting_idx):
        print(f'{idx+1}: {url_list[val]}')
def get_tokens_for(self, index, unstemmed=False):
    """Given an index (title or abstract), returns a list of tokens from
    the words contained in that index. By default, this will return
    case-folded and stemmed tokens. If the unstemmed argument is set to
    True, the original words will be returned instead."""
    raw_text = self.__text.get(index)
    if unstemmed:
        # We need to return just the words, without any punctuation.
        words = raw_text.split()
        stripped = [x.strip(string.punctuation) for x in words]
        return stripped
    tokens = tokenizer(raw_text)
    # We want to strip out tokens which consist of just
    # punctuation characters.
    return [x for x in tokens if x not in string.punctuation]
def solve(equation):
    expression = tokenizer(equation)
    left_side = right_side = None
    comp = None
    for i, token in enumerate(expression):
        if token.type == 'Comparator':
            left_side = build(expression[:i])
            right_side = build(expression[i + 1:])
            comp = token.value
            break
    if comp is None:
        expression = build(expression)
        print(f'The answer is {solve_side(expression)}')
    else:
        if comp == '=':
            comp = '=='
        result = eval(
            f'{solve_side(left_side)} {comp} {solve_side(right_side)}')
        print(f'The equation is {result}')
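# Hedged usage sketch (added, not part of the original source), assuming the
# surrounding tokenizer/build/solve_side helpers evaluate arithmetic as their
# names suggest:
#
#   solve("2 + 3")       # no comparator  -> prints "The answer is 5"
#   solve("2 + 3 = 5")   # '=' comparator -> evaluated as '==', prints "The equation is True"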
def NBtesting(doc_string):
    current_file_path = os.path.dirname(os.path.abspath(__file__))
    # total document number
    # documentCount = len(testing_docs)
    # result is a dictionary, used to store the testing result.
    # i.e. `result` = { doc_1: class_of_doc_1, doc_2: class_of_doc_2, ...}
    result = dict()
    classes = ["1", "2", "3", "5"]
    # training result files hold (new_V, prior, condprob), where
    # condprob[term][_class] stores the term/class conditional probability
    f_v = open(current_file_path + "/training_result/v.txt", "r")
    v = f_v.read().decode("utf-8").split(",")
    f_prior = open(current_file_path + "/training_result/prior.txt", "r")
    prior = dict()
    for row in csv.DictReader(f_prior):
        prior[row["class"]] = float(row["prob"])
    f_condprob = open(current_file_path + "/training_result/condprob.txt", "r")
    condprob = dict()
    for row in csv.DictReader(f_condprob):
        term = row["term"].decode("utf-8")
        _class = row["class"]
        prob = float(row["prob"])
        if term not in condprob:
            condprob[term] = dict()
        condprob[term][_class] = prob
    doc_terms = tokenizer.tokenizer(doc_string)
    result = naive_bayes.ApplyMultinomialNB(classes, v, prior, condprob, doc_terms)
    return result
def parse_input(self):
    assign_token = []
    assign_value = []
    input_lines = input(">>: ")  # 3,4
    input_tokens = tokenizer(input_lines)
    for token in input_tokens:
        if not token[1] == "comma":
            if token[1] == 'bool':
                val = (token[0].replace("\"", ""), token[1])
                token = val
            assign_value.append(token)
    i = 0
    assign_token.append(self.current_token)
    assign_token.append(("=", "assignment"))
    assign_token.append(assign_value[i])
    assign_token.append(("INPUT", "INPUT"))
    self.keep("identifier")
    i += 1
    while self.current_token[1] == 'comma':
        self.keep("comma")
        assign_token.append(self.current_token)
        assign_token.append(("=", "assignment"))
        self.keep("identifier")
        if assign_value[i][1] != 'EOF':
            assign_token.append(assign_value[i])
            assign_token.append(("INPUT", "INPUT"))
        else:
            self.error("\nExpected more inputs")
        i += 1
    if (len(assign_value) - 1) != i:
        # input values is greater than identifiers
        self.error("\nExpected less inputs")
    for i in range(0, len(assign_token), 4):
        input_assign = Parser(assign_token[i:i + 4])
        input_assign.parse_assign()
def batch_test():
    for jack_path, target_path in [
        ['../test/ArrayTest/Main.jack',
         '../test/engine_test/array_main_actual.xml'],
        ['../test/Square/Main.jack',
         '../test/engine_test/square_main_actual.xml'],
        ['../test/Square/Square.jack',
         '../test/engine_test/square_actual.xml'],
        ['../test/Square/SquareGame.jack',
         '../test/engine_test/square_game_actual.xml'],
        ['../test/ExpressionLessSquare/Main.jack',
         '../test/engine_test/exp_main_actual.xml'],
        ['../test/ExpressionLessSquare/Square.jack',
         '../test/engine_test/exp_actual.xml'],
        ['../test/ExpressionLessSquare/SquareGame.jack',
         '../test/engine_test/exp_game_actual.xml'],
    ]:
        print(jack_path)
        print(target_path)
        with open(jack_path, 'r') as f:
            content = f.read()
        tokens = tokenizer(content)
        CompilationEngine(tokens).treeToXml(target_path)
import unicodedata
import nltk
import enchant
import os
import tokenizer
import metaphone
import plausibleWords
import dictionarySearch
import getngrams

splitSentencesArray = tokenizer.tokenizer()
incorrectlySpelled = dictionarySearch.dictionarySearch(splitSentencesArray)
correctlySpelled = []
summationArray = []
outputSentencesArray = splitSentencesArray
tempSum = 0.0

for i in xrange(len(incorrectlySpelled)):
    plausibleList = plausibleWords.plausibleWords(
        splitSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]])
    for j in xrange(len(plausibleList)):
        if incorrectlySpelled[i][1] != 0 and \
                incorrectlySpelled[i][1] != (len(splitSentencesArray[incorrectlySpelled[i][0]]) - 1):
            testString1 = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + plausibleList[j] \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
            testString = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + " " + plausibleList[j] + " " \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
        elif incorrectlySpelled[i][1] == 0:
            testString1 = plausibleList[j] \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
            testString = plausibleList[j] + " " \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
        else:
            testString1 = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + plausibleList[j]
            testString = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + " " + plausibleList[j]
        os.system("getngrams.py " + testString + " -noprint -quit")
def preProcess(texto, linguagem):
    return stemming.stemmer(
        removalStopwords.removalStopwords(tokenizer.tokenizer(texto), linguagem),
        linguagem)
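# Hedged note (added, not part of the original source): preProcess above chains
# tokenize -> stopword removal -> stemming, passing the language through each
# stage. Unrolled with the same helper modules it would read:
#
#   def preProcess(texto, linguagem):
#       tokens = tokenizer.tokenizer(texto)
#       sem_stopwords = removalStopwords.removalStopwords(tokens, linguagem)
#       return stemming.stemmer(sem_stopwords, linguagem)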
def _is_jack_file(file):
    return file.split('.')[1] == 'jack'


def _get_jack_files(arg):
    if _is_file(arg):
        files = [arg]
    else:
        content_in_directory = \
            ['{}/'.format(arg) + content for content in os.listdir(arg)]
        files = [file for file in content_in_directory if _is_file(file)]
    return (file for file in files if _is_jack_file(file))


def _get_output_filename(arg):
    return '{}.vm'.format(arg.split('.jack')[0])


##################--------- DRIVER CODE -------###################### # NOQA

if __name__ != '__main__':
    print 'Please run as a self-contained program'

jack_files = _get_jack_files(sys.argv[1])
for file in jack_files:
    token_gen = tokenizer(file)
    output_file = open(_get_output_filename(file), 'w')
    compile_file(output_file, token_gen)
def pass0(self):
    # read input file and parse it (also invokes the preprocessor)
    self.t = tokenizer.tokenizer()
    self.t.parse(self.mainFileName)
    return "-"


def binary_op():
    # print "Parsing bin op with %s" % sym
    if sym.type == "+":
        return "+"
    elif sym.type == "-":
        return "-"
    elif sym.type == "*":
        return "*"
    elif sym.type == "/":
        return "/"


if __name__ == '__main__':
    import sys
    line = sys.stdin.read()
    t = tokenizer()(line)

    def f():
        global sym
        try:
            sym = map_sym(t.next())
            # print "Sym = %s" % sym
        except StopIteration:
            return

    getsym = f
    getsym()
    print expect(expr())
elif args.dataset == 'covid':
    folder_name = 'covid/'
    pos_weight = torch.tensor((290726 - 405) / 290726, dtype=torch.float32)
    # pos_weight = torch.tensor(1.0, dtype=torch.float32)
    valid = readData(folder_name + 'dev.csv')
    test = readData(folder_name + 'test.csv')
    train = readData(folder_name + 'train.csv')
elif args.dataset == 'for_submission':
    folder_name = 'for_submission/'
    pos_weight = torch.tensor((2335 - 120) / 120, dtype=torch.float32)
    # pos_weight = torch.tensor(1.0, dtype=torch.float32)
    valid = readData(folder_name + 'dev.csv')
    test = readData(folder_name + 'train.csv')
    train = readData(folder_name + 'train.csv')

tk = tokenizer(train + valid + test)

# def make_batch(data):
#     X = []
#     Y = []
#     lengths = []
#     weights = []
#     for d in data:
#         x, x_len = tk.tokenize(d[0])
#         y = d[1]
#         Y.append(float(y))
#         # X.append(strToLong(x, char2int, max_length))
#         X.append(x)
#         lengths.append(x_len)
#     X = np.stack(X, axis=0)
#     Y = np.array(Y)
        # Add the next word to the sequence
        sequence += "{} ".format(next_state.split()[0])
        curr_state = next_state
        lower_bound = 0.0
        l += 1
        # Ensures the sentence generated ends appropriately
        if l >= k and next_state[len(next_state) - 1] == '.':
            end = True
    return sequence


if __name__ == '__main__':
    # Command line args conditionals
    if len(sys.argv) >= 3:
        filename = sys.argv[1]
        N = int(sys.argv[2])
    elif len(sys.argv) == 2:
        filename = sys.argv[1]
        N = 1
    else:
        filename = "markov_test_1.txt"
        N = 1
    begin_state = ""
    directory = "text_files/" + filename
    # Deque object used to store n-token states
    tokens = tokenizer.tokenizer(directory)
    markovchain, begin_state = create_map(tokens, N)
    # pairs = markovchain.get_pairs()
    # for i in range(50):
    #     print("{} contains {}".format(pairs[i][0], pairs[i][1]))
    generated_string = generate_sequence(markovchain, None, 25)
    print(generated_string)
def test_reveal_offset(self):
    e1 = "to sentence here 12:30 3h 15min tags:tag1,tag3 , tag4"
    tok = tokenizer(e1)
    self.assertEqual(tok.reveal_next_token(1), ("WORD", "sentence"))
        i, j[0], k[0],
    ))
    for x in range(1, l):
        print('%20s %20s %20s' % (
            "", "", k[x],
        ))


if __name__ == "__main__":
    tokens = tokenizer.tokens
    table = {k: [0, set()] for k in tokens}
    lexer = lex.lex(module=tokenizer())
    filename = sys.argv[1]
    if os.path.exists(filename):
        file = open(filename, 'r')
        data = file.read()
        lexer.input(data)
        while True:
            tokk = lexer.token()
            if not tokk:
                break
            table[tokk.type][0] += 1
            table[tokk.type][1].add(tokk.value)
        file.close()
    else:
        print("File Does Not Exist")
    Print()
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer  # needed for self.tfidf below
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import pandas as pd
import json
import tokenizer
import dummytokenize
import pickle

t = tokenizer.tokenizer()


class create_CNN():
    def __init__(self, X, Y, config, tokenizer=None):
        self.config_dict = json.load(open(config))
        self.lb = LabelEncoder()
        self.tfidf = TfidfVectorizer(tokenizer=tokenizer, preprocessor=tokenizer,
                                     token_pattern=None)
        self.x = X
        self.y = Y

    def fit_tfidf(self):
        print(self.x)
    logging.basicConfig(format='[%(asctime)s.%(msecs)03d] %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d_%H:%M:%S', level=numeric_level)
    logging.info('Created Logger level={}'.format(loglevel))
else:
    logging.basicConfig(filename=logfile,
                        format='[%(asctime)s.%(msecs)03d] %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d_%H:%M:%S', level=numeric_level)
    logging.info('Created Logger level={} file={}'.format(loglevel, logfile))

################################################
### MAIN #######################################
################################################

if __name__ == '__main__':
    fin = None
    fout = None
    num_threads = 1
    t = tokenizer()
    usage = """usage: {} [-i FILE -o FILE -num_threads INT] [tok_options]
    -i: (stdin)
    -o: (stdout)
    -num_threads: 1 (used only when -i and -o are used)
    -h: this message
    tok_options (See https://github.com/OpenNMT/Tokenizer for more details):
""".format(sys.argv.pop(0), t.tokopts)
    for k, v in t.tokopts.items():
        usage += "    -{}: {}\n".format(k, v)
    sys.argv = t.updateOpts(sys.argv)
    while len(sys.argv):
        tok = sys.argv.pop(0)
def _is_jack_file(file):
    return file.split('.')[1] == 'jack'


def _get_jack_files(arg):
    if _is_file(arg):
        files = [arg]
    else:
        content_in_directory = \
            ['{}/'.format(arg) + content for content in os.listdir(arg)]
        files = [file for file in content_in_directory if _is_file(file)]
    return (file for file in files if _is_jack_file(file))


def _get_output_filename(arg):
    return '{}_output.xml'.format(arg.split('.jack')[0])


##################--------- DRIVER CODE -------###################### # NOQA

if __name__ != '__main__':
    print 'Please run as a self-contained program'

jack_files = _get_jack_files(sys.argv[1])
for file in jack_files:
    token_gen = tokenizer(file)
    output_file = open(_get_output_filename(file), 'w')
    compile_file(output_file, token_gen)
stateOutput = open('states' + args.output + '.csv', 'w')
inputcsv = csv.reader(csvInput)
citycsv = csv.writer(cityOutput)
statecsv = csv.writer(stateOutput)
cityDict = dict()
stateDict = dict()
cityMeanDict = dict()
stateMeanDict = dict()

for row in inputcsv:
    if args.output == 'Yelp':
        [sentiment, city, state] = row
        if state not in states:
            continue
    elif args.output == 'Zagat':
        [sentiment, state, city] = row
    city = ' '.join(tokenizer(city))
    city = city + ':' + state
    sentiment = float(sentiment)
    if city in cityDict:
        crrnt = cityDict[city]
        cityDict[city] = [crrnt[0] + sentiment, crrnt[1] + 1]
    else:
        cityDict[city] = [sentiment, 1]
    if state in stateDict:
        crrnt = stateDict[state]
        stateDict[state] = [crrnt[0] + sentiment, crrnt[1] + 1.0]
    else:
        stateDict[state] = [sentiment, 1.0]

for k in cityDict:
    v = cityDict[k]
    mean = v[0] / v[1]