def app():
    # NOTE: raw_input and the manual JSON string-building below are from the
    # original Python 2 code; building a dict and serializing it with the
    # json module would be far less error-prone.
    file = open('data.json', 'w')
    json_string = '{"name":"'
    conversation_name = raw_input("Enter a name for this conversation: ")
    description = raw_input("Enter a description for this conversation: ")
    json_string += conversation_name
    json_string += '", '
    json_string += '"intents":['
    # Get intent and training data from user
    json_string += get_intent()
    # Use AlchemyAPI to get keywords so that users can get an idea for what
    # kind of entities they should create
    get_analysis = raw_input(
        "Would you like to use a few sentences to help you come up with entities? (Y/N) ")
    if get_analysis in ('y', 'Y'):
        get_keywords()
    # Users create entities and values, and then get synonyms to those values
    json_string += '"entities":['
    json_string += get_entities()
    json_string += '"language":"en","metadata":null,"description":"'
    json_string += description
    json_string += '",'
    json_string += '"dialog_nodes":[]}'
    file.write(json_string)
    file.close()

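# A minimal sketch of the same export built as a dict instead of string
# concatenation, assuming get_intent()/get_entities() could return Python
# lists rather than raw JSON fragments (an assumption; in the original they
# return pre-serialized strings). json.dump then handles all quoting/escaping.
import json

def app_dict_version():
    workspace = {
        "name": raw_input("Enter a name for this conversation: "),
        "description": raw_input("Enter a description for this conversation: "),
        "intents": [],    # e.g. extend with structured results from get_intent()
        "entities": [],   # e.g. extend with structured results from get_entities()
        "language": "en",
        "metadata": None,
        "dialog_nodes": [],
    }
    with open('data.json', 'w') as f:
        json.dump(workspace, f)
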
def get_definition(self):
    """
    Returns the definitions of the path under the cursor. This is
    not a goto function! This follows complicated paths and returns the
    end, not the first definition.

    :return: list of Definition objects, which are basically scopes.
    :rtype: list
    """
    def resolve_import_paths(scopes):
        for s in scopes.copy():
            if isinstance(s, imports.ImportPath):
                scopes.remove(s)
                scopes.update(resolve_import_paths(set(s.follow())))
        return scopes

    goto_path = self.module.get_path_under_cursor()
    context = self.module.get_context()
    if next(context) in ('class', 'def'):
        scopes = set([self.module.parser.user_scope])
    elif not goto_path:
        op = self.module.get_operator_under_cursor()
        scopes = set([keywords.get_operator(op, self.pos)] if op else [])
    else:
        scopes = set(self._prepare_goto(goto_path))

    scopes = resolve_import_paths(scopes)

    # add keywords
    scopes |= keywords.get_keywords(string=goto_path, pos=self.pos)

    d = set([Definition(s) for s in scopes])
    return sorted(d, key=lambda x: (x.module_path, x.start_pos))

def process_file(file_path, tagger, idf_doc_count, idf_table, threshold,
                 maximum_words):
    """
    Takes the uploaded file, detects its type (plain text, ALTO XML, zip)
    and calls a parsing function accordingly. If everything succeeds, it
    returns the keywords and a 200 code; otherwise it returns an error.
    """
    file_info = magic.from_file(file_path)
    lines = []
    # the parentheses in the magic description must be escaped in the regex
    if re.match(r"^UTF-8 Unicode \(with BOM\) text", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8-sig')
    elif re.match("^UTF-8 Unicode", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif re.match("^ASCII text", file_info):
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif re.match('^XML 1.0 document', file_info) and \
            (file_path.endswith('.alto') or file_path.endswith('.xml')):
        lines = lines_from_alto_file(file_path)
    elif re.match('^Zip archive data', file_info):
        lines = lines_from_zip_file(file_path)
    else:
        return {"error": "Unsupported file type: {}".format(file_info)}, 400

    if not lines:
        return {"error": "Empty file"}, 400

    return keywords.get_keywords(lines, tagger, idf_doc_count, idf_table,
                                 threshold, maximum_words), 200

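# A quick illustration of what the dispatch above keys on: python-magic's
# magic.from_file() returns a human-readable type description string. The
# sample strings here are typical libmagic output but are assumptions, not
# output captured from this project.
import re

samples = [
    "UTF-8 Unicode (with BOM) text",
    "ASCII text, with very long lines",
    "Zip archive data, at least v2.0 to extract",
    "PNG image data, 512 x 512",
]
for file_info in samples:
    if re.match(r"^UTF-8 Unicode \(with BOM\) text", file_info):
        kind = "utf-8-sig text"
    elif re.match("^(UTF-8 Unicode|ASCII text)", file_info):
        kind = "utf-8 text"
    elif re.match("^Zip archive data", file_info):
        kind = "zip"
    else:
        kind = "unsupported"
    print(file_info, "->", kind)
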
def on_get(self, req, resp, start, end):
    """All tracked keywords in the database.

    Returns a sorted list with the keywords and their counts.
    Takes the "group" GET parameter for the keyword group.
    """
    global KEYWORDS, keywords_sync_time
    if (time() - keywords_sync_time) > 60 * 60:
        KEYWORDS = get_keywords(local=True)
        keywords_sync_time = time()

    query = {
        "num_keywords": {"$gt": 0},
        "datetime": {"$gte": start, "$lt": end}
    }
    group = req.get_param("group")
    if group:
        del query["num_keywords"]
        query["groups"] = group

    tw = tweets.find(query, projection={"keywords": True, "_id": False})
    counts = Counter()
    for t in tw:
        kws = t["keywords"]
        if group:
            keywords = []
            for kw in kws:
                if kw in KEYWORDS:
                    if group in KEYWORDS[kw].groups:
                        keywords.append(kw)
            kws = keywords
        counts.update(kws)

    data = [{"keyword": kw, "count": c} for kw, c in counts.most_common()]
    resp.body = json.dumps(data)

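# The counting pattern above in isolation: collections.Counter.update()
# accumulates keyword occurrences across documents, and most_common() yields
# them sorted by count. The sample documents are made up for illustration.
from collections import Counter

docs = [
    {"keywords": ["rose", "tulip"]},
    {"keywords": ["tulip"]},
    {"keywords": ["tulip", "orchid"]},
]
counts = Counter()
for doc in docs:
    counts.update(doc["keywords"])
print(counts.most_common())  # [('tulip', 3), ('rose', 1), ('orchid', 1)]
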
def answer_question(teacher_chat_id, parent_chat_id, answer):
    ques = None  # stays None when there is no pending question
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM parentsQuestions")
        result = cursor.fetchone()
        if result is not None:
            ques = result['question']
            keywords = get_keywords([ques])
            # Parameterized queries instead of f-string interpolation,
            # which was vulnerable to SQL injection.
            cursor.execute(
                "INSERT INTO QA VALUES(%s, %s, %s, %s)",
                (result['chat_id'], answer, ques, keywords))
            cursor.execute(
                "DELETE FROM parentsQuestions WHERE question = %s", (ques,))
            connection.commit()
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, parent_chat_id, "The teacher says:\n" + answer))
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, teacher_chat_id,
                "Is this a general or a private question? (general/ private)"))
        else:
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, teacher_chat_id,
                "There are no questions to answer! "
                "Thank you and have a nice day! :) "))
    return ques

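# The same parameterization pattern used in the fix above, as a standalone
# sketch. It assumes a PyMySQL-style connection (the driver these snippets
# appear to use, given the dict-style rows); table and column layout are
# taken from the snippet, the helper name is illustrative. The driver escapes
# each %s value, so quotes in user input cannot break out of the SQL string.
import pymysql  # assumed driver

def insert_qa(connection, chat_id, answer, question, keywords):
    with connection.cursor() as cursor:
        cursor.execute(
            "INSERT INTO QA VALUES(%s, %s, %s, %s)",
            (chat_id, answer, question, keywords))
    connection.commit()
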
def complete(self):
    """
    An auto completer for python files.

    :return: list of Completion objects.
    :rtype: list
    """
    path = self.module.get_path_until_cursor()
    path, dot, like = self._get_completion_parts(path)

    try:
        scopes = list(self._prepare_goto(path, True))
    except NotFoundError:
        scopes = []
        scope_generator = evaluate.get_names_for_scope(
            self.parser.user_scope, self.pos)
        completions = []
        for scope, name_list in scope_generator:
            for c in name_list:
                completions.append((c, scope))
    else:
        completions = []
        debug.dbg('possible scopes', scopes)
        for s in scopes:
            # TODO is this really the right way? just ignore the funcs?
            # do the magic functions first? and then recheck here?
            if not isinstance(s, evaluate.Function):
                if isinstance(s, imports.ImportPath):
                    names = s.get_defined_names(on_import_stmt=True)
                else:
                    names = s.get_defined_names()
                for c in names:
                    completions.append((c, s))

    if not dot:  # named_params have no dots
        call_def = self.get_in_function_call()
        if call_def:
            if not call_def.module.is_builtin():
                for p in call_def.params:
                    completions.append((p.get_name(), p))

        # Do the completion if there is no path before and no import stmt.
        if (not scopes or not isinstance(scopes[0], imports.ImportPath)) \
                and not path:
            # add keywords
            bs = builtin.Builtin.scope
            completions += ((k, bs) for k in keywords.get_keywords(all=True))

    completions = [(c, s) for c, s in completions
                   if settings.case_insensitive_completion
                   and c.names[-1].lower().startswith(like.lower())
                   or c.names[-1].startswith(like)]

    needs_dot = not dot and path
    completions = set(completions)

    c = [Completion(c, needs_dot, len(like), s) for c, s in completions]
    return c

def build_model005(df):
    from keywords import get_keywords

    x_cols = ['salary_min', 'salary_max', 'title', 'abstract']

    # print('!' * 80)
    # no_min = df['salary_min'].isnull()
    # no_min_max = df['salary_max'][no_min]
    # print('no_min_max')
    # print(no_min_max)
    # no_max = df['salary_max'].isnull()
    # no_max_min = df['salary_min'][no_max]
    # print('no_max_min')
    # print(no_max_min)

    has_minmax = df['salary_min'].notnull() & df['salary_max'].notnull()
    df = df[has_minmax]

    df_train, df_test = split_train_test(df)
    X, y = getXy(df_train, x_cols)
    X_test, _ = getXy(df_test, x_cols)
    # X.dropna(how='all', inplace=True)

    keywords = get_keywords(50)
    X = add_keywords(X, 'title', keywords['title'])
    X = add_keywords(X, 'abstract', keywords['abstract'])
    X_test = add_keywords(X_test, 'title', keywords['title'])
    X_test = add_keywords(X_test, 'abstract', keywords['abstract'])
    return X, y, X_test

def goto_definitions(self):
    """
    Return the definitions of the path under the cursor. This is a goto
    function! It follows complicated paths and returns the end, not the
    first definition. The big difference between :meth:`goto_assignments`
    and :meth:`goto_definitions` is that :meth:`goto_assignments` doesn't
    follow imports and statements. Multiple objects may be returned,
    because Python itself is a dynamic language, which means depending on
    an option you can have two different versions of a function.

    :rtype: list of :class:`api_classes.Definition`
    """
    def resolve_import_paths(scopes):
        for s in scopes.copy():
            if isinstance(s, imports.ImportPath):
                scopes.remove(s)
                scopes.update(resolve_import_paths(set(s.follow())))
        return scopes

    goto_path = self._module.get_path_under_cursor()
    context = self._module.get_context()
    scopes = set()
    lower_priority_operators = ("()", "(", ",")
    """Operators that could hide callee."""
    if next(context) in ("class", "def"):
        scopes = set([self._module.parser.user_scope])
    elif not goto_path:
        op = self._module.get_operator_under_cursor()
        if op and op not in lower_priority_operators:
            scopes = set([keywords.get_operator(op, self.pos)])

    # Fetch definition of callee
    if not goto_path:
        (call, _) = self._func_call_and_param_index()
        if call is not None:
            while call.next is not None:
                call = call.next
            # reset cursor position:
            (row, col) = call.name.end_pos
            self.pos = (row, max(col - 1, 0))
            self._module = modules.ModuleWithCursor(self._source_path,
                                                    source=self.source,
                                                    position=self.pos)
            # then try to find the path again
            goto_path = self._module.get_path_under_cursor()

    if not scopes:
        if goto_path:
            scopes = set(self._prepare_goto(goto_path))
        elif op in lower_priority_operators:
            scopes = set([keywords.get_operator(op, self.pos)])

    scopes = resolve_import_paths(scopes)

    # add keywords
    scopes |= keywords.get_keywords(string=goto_path, pos=self.pos)

    d = set([api_classes.Definition(s) for s in scopes
             if not isinstance(s, imports.ImportPath._GlobalNamespace)])
    return self._sorted_defs(d)

def extract_keywords():
    document = request.get_json()['text']
    response = Response(
        response=json.dumps(list(get_keywords(document))),
        status=200,
        mimetype="application/json",
    )
    return response

def main(tweets):
    tweet_clusters = scipy_clustering(tweets)
    print len(tweet_clusters), len(tweets)
    for number, cluster in enumerate(tweet_clusters[:10]):
        print "These are the keywords in cluster:", number + 1, \
            "Length:", len(cluster["list"]), "Score:", cluster["score"]
        print kw.reduce_text_list(
            kw.get_keywords(kw.reduce_text_list(cluster["list"], as_list=True), 5))
        print ""

def get_tweet(dict_tweet):
    tweet = ""
    tweet += "Tweet by @%s\n" % (dict_tweet["user"]["name"])
    tweet += "%s\n" % (dict_tweet["text"])
    try:
        tweet += "%s" % (dict_tweet["entities"]["urls"][0]["url"])
    except (KeyError, IndexError):
        # not every tweet carries a URL entity
        pass
    keys = keywords.get_keywords(dict_tweet["text"])
    # keys = " ".join(keys)
    # url_image = list(get_google_image.get_scrapped_image(keys))[0]
    url_image = ""
    return tweet, url_image

def answer_add_question(answer, chat_id):
    global the_question_to_answer
    with connection.cursor() as cursor:
        cursor.execute(
            "SELECT * FROM QA WHERE question=%s AND chat_id=%s",
            (the_question_to_answer, chat_id))
        result = cursor.fetchone()
        if result is None:
            keywords = get_keywords([the_question_to_answer])
            # Parameterized to avoid SQL injection via the question/answer text.
            cursor.execute(
                "INSERT INTO QA VALUES(%s, %s, %s, %s)",
                (chat_id, answer, the_question_to_answer, keywords))
            connection.commit()
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, chat_id, "Question added successfully!"))
    the_question_to_answer = ''

def ask_question(question, chat_id, class_):
    first_question = question
    similar_questions = []
    with connection.cursor() as cursor:
        # keywords are extracted from the raw question text
        keywords = get_keywords([question])
        cursor.execute("SELECT * FROM QA")
        res = cursor.fetchall()
        split_keywords = keywords.split()
        for result in res:
            db_keywords = result['keywords'].split()
            for key in split_keywords:
                if key in db_keywords:
                    similar_questions.append(result['question'])
                    break
        if len(similar_questions) != 0:
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, chat_id,
                "It seems we have similar questions:\n"
                "Write %<number> to choose the question you want\n"))
            for i, similar in enumerate(similar_questions):
                requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                    TOKEN, chat_id, str(i) + ") " + similar + "\n"))
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, chat_id, "If you can't find your question, write 'none'"))
        else:
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, chat_id,
                "No similar questions were asked, we will send your question "
                "to the teacher. Please wait for the answer.\n"))
            # Parameterized to avoid SQL injection via the question text.
            cursor.execute(
                "INSERT INTO parentsQuestions VALUES(%s, %s)",
                (chat_id, first_question))
            ask_question2(first_question, class_)
            connection.commit()
    return chat_id, first_question, similar_questions

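# The similarity test above in isolation: two questions count as "similar"
# when their keyword strings share at least one token. A set intersection
# expresses the same check directly (the sample keyword strings are made up).
incoming = "school trip payment deadline"
stored = "trip permission form"
if set(incoming.split()) & set(stored.split()):
    print("similar")  # they share the token 'trip'
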
def get_definition(self):
    """
    Returns the definitions of the path under the cursor. This is not a
    goto function! This follows complicated paths and returns the end, not
    the first definition. The big difference of goto and get_definition is
    that goto doesn't follow imports and statements. Multiple objects may
    be returned, because Python itself is a dynamic language, which means
    depending on an option you can have two different versions of a
    function.

    :return: list of Definition objects, which are basically scopes.
    :rtype: list
    """
    def resolve_import_paths(scopes):
        for s in scopes.copy():
            if isinstance(s, imports.ImportPath):
                scopes.remove(s)
                scopes.update(resolve_import_paths(set(s.follow())))
        return scopes

    goto_path = self.module.get_path_under_cursor()
    context = self.module.get_context()
    if next(context) in ('class', 'def'):
        scopes = set([self.module.parser.user_scope])
    elif not goto_path:
        op = self.module.get_operator_under_cursor()
        scopes = set([keywords.get_operator(op, self.pos)] if op else [])
    else:
        scopes = set(self._prepare_goto(goto_path))

    scopes = resolve_import_paths(scopes)

    # add keywords
    scopes |= keywords.get_keywords(string=goto_path, pos=self.pos)

    d = set([api_classes.Definition(s) for s in scopes
             if not isinstance(s, imports.ImportPath._GlobalNamespace)])
    return sorted(d, key=lambda x: (x.module_path, x.start_pos))

def build_model004(df):
    from keywords import get_keywords

    x_cols = ['title', 'abstract']
    df_train, df_test = split_train_test(df)
    X, y = getXy(df_train, x_cols)
    X_test, _ = getXy(df_test, x_cols)

    keywords = get_keywords(50)
    print('keywords=%s' % keywords)
    print('X before=%s:%s' % (list(X.shape), X.columns))
    X = add_keywords(X, 'title', keywords['title'])
    X = add_keywords(X, 'abstract', keywords['abstract'])
    X_test = add_keywords(X_test, 'title', keywords['title'])
    X_test = add_keywords(X_test, 'abstract', keywords['abstract'])
    print('X after =%s:%s' % (list(X.shape), X.columns))
    return X, y, X_test

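# add_keywords itself is not shown in these snippets. A plausible minimal
# version, assuming it adds one boolean indicator column per keyword found in
# a text column (pandas; the function body and column naming are assumptions,
# only the call signature comes from the snippets above):
import pandas as pd

def add_keywords_sketch(X, col, kw_list):
    for kw in kw_list:
        # e.g. column 'title=engineer' is True when 'engineer' appears in title
        X['%s=%s' % (col, kw)] = X[col].str.contains(kw, case=False, na=False)
    return X

X = pd.DataFrame({'title': ['Senior Engineer', 'Nurse', 'Data Engineer']})
print(add_keywords_sketch(X, 'title', ['engineer', 'nurse']))
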
def get_award(self, file_path):
    print "Converting xml file: " + file_path
    tree = ET.parse(file_path)
    root = tree.getroot()
    award = root.find('Award')

    # create a new AwardItem object from the xml data
    awardItem = AwardItem()
    awardItem['Title'] = award.find('AwardTitle').text
    awardItem['Date'] = award.find('AwardEffectiveDate').text
    amount = award.find('AwardAmount')
    if amount is not None:
        awardItem['Amount'] = amount.text

    # add institution
    institution = award.find('Institution')
    if institution is not None:
        awardItem['Institution'] = institution.find('Name').text

    # add the division and investigator
    division = award.find('Division')
    if division is not None:
        awardItem['Division'] = division.find('LongName').text
    investigator = award.find('Investigator')
    if investigator is not None:
        awardItem['Investigator'] = (investigator.find('FirstName').text
                                     + " " + investigator.find('LastName').text
                                     + " | " + investigator.find('EmailAddress').text)

    abstract = award.find('AbstractNarration')
    if abstract is not None and abstract.text is not None:
        awardItem['Abstract'] = abstract.text
        # only extract keywords when there is abstract text to work with
        awardItem['Keywords'] = get_keywords(abstract.text)
    return awardItem

def answer_the_last_question(answer, teacher_chat_id):
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM parentsQuestionsQueue LIMIT 1")
        result = cursor.fetchone()
        if result is not None:
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, result['chat_id'],
                "The answer for your last question is:\n" + answer))
            ques = result['question']
            keywords = get_keywords([ques])
            # Parameterized to avoid SQL injection via the answer/question text.
            cursor.execute(
                "INSERT INTO QA VALUES(%s, %s, %s, %s)",
                (teacher_chat_id, answer, ques, keywords))
            cursor.execute(
                "DELETE FROM parentsQuestionsQueue WHERE question = %s",
                (ques,))
            connection.commit()
        else:
            requests.get(TELEGRAM_SEND_MESSAGE_URL.format(
                TOKEN, teacher_chat_id, "THERE IS NO QUESTION TO ANSWER!!"))

def on_data(self, data):
    try:
        json_data = json.loads(data)
        json_send_data = self.json_filter(json_data)
        json_send_data['senti_val'] = afinn.score(json_data['text'])
        json_send_data['sentiment'] = self.fun(afinn.score(json_data['text']))
        json_send_data['keywords'] = (keywords.get_keywords(json_data['text'])
                                      + keywords.extract_hashtag(json_data['text']))
        print(json_send_data['text'], " >>>>>>>> ",
              json_send_data['keywords'], " >>>>>>>> ",
              json_send_data['sentiment'])
        self.producer.send_messages(b'twitter', json.dumps(json_send_data))

        # NOTE: opening a new MongoClient per message is expensive; a single
        # client created once (e.g. in __init__) would normally be reused.
        client_mongo = pymongo.MongoClient('localhost', 27017)
        db = client_mongo['dicdatabase']
        db.test.insert_one(json_send_data)
        return True
    except KeyError:
        # messages without a 'text' field (e.g. delete notices) are skipped
        return True

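# A minimal sketch of the client-reuse pattern suggested in the note above,
# assuming the same localhost MongoDB; the class and method names here are
# illustrative, not from the original listener.
import pymongo

class TweetSink(object):
    def __init__(self):
        # one connection pool for the lifetime of the stream listener
        self.db = pymongo.MongoClient('localhost', 27017)['dicdatabase']

    def save(self, doc):
        self.db.test.insert_one(doc)
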
default="ptrs", choices=["ptrs", "make_ie_data", "prep_gen_data"], help="what utility function to run", ) parser.add_argument("--min-freq", type=int, default=0) parser.add_argument("--test", action="store_true", help="use test data") parser.add_argument("--lang", type=str, choices=["en", "de"], help="language", default="de") args = parser.parse_args() LANG = args.lang Keywords = get_keywords(LANG) if args.mode == "ptrs": make_pointerfi(args.output_fi, inp_file=args.input_path) elif args.mode == "make_ie_data": save_full_sent_data( args.output_fi, path=args.input_path, multilabel_train=True, min_freq=args.min_freq, ) elif args.mode == "prep_gen_data": prep_generated_data( args.gen_fi, args.dict_pfx, args.output_fi,
def kws():
    text = request.args.get('text', None)
    return json.dumps({'keywords': keywords.get_keywords(text)})

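# Hitting an endpoint like the one above from a client, assuming it is a
# Flask view routed at /keywords on localhost:5000 (the route and host are
# assumptions; the snippet does not show its @app.route decorator):
import requests

r = requests.get("http://localhost:5000/keywords",
                 params={"text": "fresh tulips from the greenhouse"})
print(r.json()["keywords"])
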
def completions(self):
    """
    Return :class:`api_classes.Completion` objects. Those objects contain
    information about the completions, more than just names.

    :return: Completion objects, sorted by name and __ comes last.
    :rtype: list of :class:`api_classes.Completion`
    """
    debug.speed('completions start')
    path = self._module.get_path_until_cursor()
    if re.search(r'^\.|\.\.$', path):
        return []
    path, dot, like = self._get_completion_parts(path)
    completion_line = self._module.get_line(self.pos[0])[:self.pos[1]]

    try:
        scopes = list(self._prepare_goto(path, True))
    except NotFoundError:
        scopes = []
        scope_generator = evaluate.get_names_of_scope(
            self._parser.user_scope, self.pos)
        completions = []
        for scope, name_list in scope_generator:
            for c in name_list:
                completions.append((c, scope))
    else:
        completions = []
        debug.dbg('possible scopes', scopes)
        for s in scopes:
            if s.isinstance(er.Function):
                names = s.get_magic_method_names()
            else:
                if isinstance(s, imports.ImportPath):
                    if like == 'import':
                        if not completion_line.endswith('import import'):
                            continue
                    a = s.import_stmt.alias
                    if a and a.start_pos <= self.pos <= a.end_pos:
                        continue
                    names = s.get_defined_names(on_import_stmt=True)
                else:
                    names = s.get_defined_names()
            for c in names:
                completions.append((c, s))

    if not dot:  # named params have no dots
        for call_def in self.call_signatures():
            if not call_def.module.is_builtin():
                for p in call_def.params:
                    completions.append((p.get_name(), p))

        # Do the completion if there is no path before and no import stmt.
        u = self._parser.user_stmt
        bs = builtin.Builtin.scope
        if isinstance(u, pr.Import):
            if (u.relative_count > 0 or u.from_ns) and not re.search(
                    r'(,|from)\s*$|import\s+$', completion_line):
                completions += ((k, bs) for k in keywords.get_keywords('import'))
        if not path and not isinstance(u, pr.Import):
            # add keywords
            completions += ((k, bs) for k in keywords.get_keywords(all=True))

    needs_dot = not dot and path

    comps = []
    comp_dct = {}
    for c, s in set(completions):
        n = c.names[-1]
        if settings.case_insensitive_completion \
                and n.lower().startswith(like.lower()) \
                or n.startswith(like):
            if not evaluate.filter_private_variable(s, self._parser.user_stmt, n):
                new = api_classes.Completion(c, needs_dot, len(like), s)
                k = (new.name, new.complete)  # key
                if k in comp_dct and settings.no_completion_duplicates:
                    comp_dct[k]._same_name_completions.append(new)
                else:
                    comp_dct[k] = new
                    comps.append(new)

    debug.speed('completions end')
    return sorted(comps, key=lambda x: (x.name.startswith('__'),
                                        x.name.startswith('_'),
                                        x.name.lower()))

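# These completions()/goto_definitions() methods sit behind jedi's public
# Script API. Usage looked roughly like this under the old pre-0.10 signature
# (the version excerpted here is an assumption based on the internal names):
import jedi

source = "import json\njson.lo"
script = jedi.Script(source, line=2, column=7, path='example.py')
for completion in script.completions():
    print(completion.name)  # e.g. 'load', 'loads'
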
def complete(self):
    """
    An auto completer for python files.

    :return: list of Completion objects, sorted by name and __ comes last.
    :rtype: list
    """
    def follow_imports_if_possible(name):
        # TODO remove this, or move to another place (not used)
        par = name.parent
        if isinstance(par, parsing.Import) and not \
                isinstance(self.parser.user_stmt, parsing.Import):
            new = imports.ImportPath(par).follow(is_goto=True)
            # Only remove the old entry if a new one has been found.
            #print par, new, par.parent
            if new:
                try:
                    return new
                except AttributeError:  # .name undefined
                    pass
        return [name]

    debug.speed('complete start')
    path = self.module.get_path_until_cursor()
    if re.search(r'^\.|\.\.$', path):
        return []
    path, dot, like = self._get_completion_parts(path)

    try:
        scopes = list(self._prepare_goto(path, True))
    except NotFoundError:
        scopes = []
        scope_generator = evaluate.get_names_for_scope(
            self.parser.user_scope, self.pos)
        completions = []
        for scope, name_list in scope_generator:
            for c in name_list:
                completions.append((c, scope))
    else:
        completions = []
        debug.dbg('possible scopes', scopes)
        for s in scopes:
            if s.isinstance(evaluate.Function):
                names = s.get_magic_method_names()
            else:
                if isinstance(s, imports.ImportPath):
                    if like == 'import':
                        l = self.module.get_line(self.pos[0])[:self.pos[1]]
                        if not l.endswith('import import'):
                            continue
                    names = s.get_defined_names(on_import_stmt=True)
                else:
                    names = s.get_defined_names()
            for c in names:
                completions.append((c, s))

    if not dot:  # named_params have no dots
        call_def = self.get_in_function_call()
        if call_def:
            if not call_def.module.is_builtin():
                for p in call_def.params:
                    completions.append((p.get_name(), p))

        # Do the completion if there is no path before and no import stmt.
        if (not scopes or not isinstance(scopes[0], imports.ImportPath)) \
                and not path:
            # add keywords
            bs = builtin.Builtin.scope
            completions += ((k, bs) for k in keywords.get_keywords(all=True))

    needs_dot = not dot and path

    comps = []
    for c, s in set(completions):
        n = c.names[-1]
        if settings.case_insensitive_completion \
                and n.lower().startswith(like.lower()) \
                or n.startswith(like):
            if not evaluate.filter_private_variable(s, self.parser.user_stmt, n):
                new = api_classes.Completion(c, needs_dot, len(like), s)
                comps.append(new)

    debug.speed('complete end')
    return sorted(comps, key=lambda x: (x.word.startswith('__'),
                                        x.word.startswith('_'),
                                        x.word.lower()))

"""" Utility for converting JSON-formatted review data into tabular form. """ import json from tqdm import tqdm from keywords import get_keywords file_name = "Clothing_Shoes_and_Jewelry_5.json" col_names = ["id", "text", "rating"] tsv_name = file_name.replace("json", "tsv") output = open(tsv_name, 'w') output.write("\t".join(["id", "rating", "keyword", "text"])) output.close() i = 0 with open(file_name) as f: for line in tqdm(f, desc="Reading from JSON"): # do stuff with this one line review = json.loads(line) text = review["reviewText"] rating = review["overall"] keyword = get_keywords(text) output = open(tsv_name, 'a') output.write("\n") output.write("\t".join([str(i), str(rating), keyword, text])) output.close() i += 1 # for smaller sets, use: # if i >= 25: # break
def produce_v2_vector_row(rid, result, doctype, keyword_coordinates,
                          found_entities):
    if "ENTITY TYPE" not in result:
        return None
    found_entity_type = resolve_entity_type(result, found_entities)
    entity_type = result.get("CHILD ENTITY TYPE", result.get("ENTITY TYPE"))
    keywords = get_keywords(doctype)
    row = get_empty_row(keywords)
    datapoint = result["value"]["value"]
    row["rid"] = rid
    row["label"] = result["NAME"]

    page_num = get_attr_list(["selection_input", "page", "document_index"], result)
    page_num += 1
    page_name = f"page_{page_num}"
    if page_name not in keyword_coordinates:
        page_name = list(keyword_coordinates.keys())[0]
    keywords_on_the_page = keyword_coordinates[page_name]
    x_normal, y_normal = (
        keywords_on_the_page["page_size"]["x"],
        keywords_on_the_page["page_size"]["y"],
    )
    center_coordinates = {"x0": 0, "y0": 0, "x1": x_normal, "y1": y_normal}

    row["entity_id"] = None
    row["page_number"] = float(page_num)
    row["entity_type"] = entity_type
    row["doctype"] = doctype
    row["currency"] = coerce_currency(datapoint, row["entity_type"])
    # the extracted value itself is what gets date-checked; the row has no
    # "datapoint" key to read back
    row["date"] = dateable(datapoint, common_date_patterns, row["entity_type"])

    datapoint_coordinates = get_attr_list(["selection_input", "pos_original"], result)
    # No coordinates? No vector
    if datapoint_coordinates is None:
        return None

    distance_af_center = calculate_distance_and_angle(center_coordinates,
                                                      datapoint_coordinates,
                                                      x_normal, y_normal)
    row["df_center"] = distance_af_center["distance"]
    row["af_center"] = distance_af_center["angle"]

    for keyword_value, keyword_coordinates_lst in keywords_on_the_page[
            "keyword_coordinates"].items():
        vector_keyword = find_closest_coordinates(
            keyword_coordinates_lst,
            datapoint_coordinates,
            x_normal,
            y_normal,
        )
        row[f"dfk_{keyword_value}"] = vector_keyword["distance"]
        row[f"afk_{keyword_value}"] = vector_keyword["angle"]
    return row

def produce_tagged_vector_row(rid, last_row, doctype, keyword_coordinates):
    page_name = get_default_page_name(last_row["page_number"],
                                      list(keyword_coordinates.keys()))
    keywords = get_keywords(doctype)
    row = get_empty_row(keywords)
    if page_name not in keyword_coordinates:
        page_name = list(keyword_coordinates.keys())[0]
    keywords_on_the_page = keyword_coordinates[page_name]
    x_normal, y_normal = (
        keywords_on_the_page["page_size"]["x"],
        keywords_on_the_page["page_size"]["y"],
    )
    center_coordinates = {"x0": 0, "y0": 0, "x1": x_normal, "y1": y_normal}

    datapoint = last_row["content"]
    row["rid"] = rid
    row["label"] = last_row["datapoint"]
    row["entity_id"] = last_row["id"]
    row["page_number"] = float(page_name.replace("page_", ""))
    row["doctype"] = doctype
    row["entity_type"] = last_row["entity_type"]
    row["currency"] = coerce_currency(datapoint, row["entity_type"])
    row["date"] = dateable(datapoint, common_date_patterns, row["entity_type"])

    if not isinstance(last_row["coordinates"], str):
        return False
    x0, y0, x1, y1 = tuple(float(x) for x in last_row["coordinates"].split(" "))
    datapoint_coordinates = {"y0": y0, "x0": x0, "y1": y1, "x1": x1}

    distance_af_center = calculate_distance_and_angle(center_coordinates,
                                                      datapoint_coordinates,
                                                      x_normal, y_normal)
    row["df_center"] = distance_af_center["distance"]
    row["af_center"] = distance_af_center["angle"]

    for keyword_value, keyword_coordinates_lst in keywords_on_the_page[
            "keyword_coordinates"].items():
        vector_keyword = find_closest_coordinates(
            keyword_coordinates_lst,
            datapoint_coordinates,
            x_normal,
            y_normal,
        )
        row[f"dfk_{keyword_value}"] = vector_keyword["distance"]
        row[f"afk_{keyword_value}"] = vector_keyword["angle"]
    return row

def produce_found_vector_row(rid, entity, doctype, keyword_coordinates, label=0):
    addr_mapper = {
        "AddressNumber": "ROUTE",
        "StreetName": "ROUTE",
        "StreetNamePostType": "ROUTE",
        "StreetNamePreDirectional": "ROUTE",
        "PlaceName": "CITY",
        "StateName": "STATE",
        "ZipCode": "ZIP",
    }
    datapoint = entity["CONTENT"]["CONTENT"]["value"]
    entity_type = entity["ENTITY_TYPE"]
    if entity_type == "ADDRESS":
        parsed_address = usaddress.parse(datapoint)
        ent_type_set = list(
            set(addr_mapper.get(tup[1], "ADDRESS") for tup in parsed_address))
        if len(ent_type_set) == 1:
            entity_type = ent_type_set[0]
        if entity_type == "ADDRESS":
            print("# ", datapoint)

    keywords = get_keywords(doctype)
    row = get_empty_row(keywords)

    page_num = entity["PAGE"]
    page_name = f"page_{page_num}"
    if page_name not in keyword_coordinates:
        page_name = list(keyword_coordinates.keys())[0]
    keywords_on_the_page = keyword_coordinates[page_name]
    x_normal, y_normal = (
        keywords_on_the_page["page_size"]["x"],
        keywords_on_the_page["page_size"]["y"],
    )
    center_coordinates = {"x0": 0, "y0": 0, "x1": x_normal, "y1": y_normal}

    row["rid"] = rid
    row["label"] = label
    row["entity_id"] = entity["ENTITY_ID"]
    row["page_number"] = float(page_num)
    row["entity_type"] = entity_type
    row["doctype"] = doctype
    row["currency"] = coerce_currency(datapoint, entity_type)
    row["date"] = dateable(datapoint, common_date_patterns, entity_type)

    # Try to match up with tagged entities
    datapoint_coordinates = entity["CONTENT"]["CONTENT"]["coordinates"]
    distance_af_center = calculate_distance_and_angle(center_coordinates,
                                                      datapoint_coordinates,
                                                      x_normal, y_normal)
    row["df_center"], row["af_center"] = (
        distance_af_center["distance"],
        distance_af_center["angle"],
    )
    for keyword_value, keyword_coordinates_lst in keywords_on_the_page[
            "keyword_coordinates"].items():
        vector_keyword = find_closest_coordinates(
            keyword_coordinates_lst,
            datapoint_coordinates,
            x_normal,
            y_normal,
        )
        row[f"dfk_{keyword_value}"], row[f"afk_{keyword_value}"] = (
            vector_keyword["distance"],
            vector_keyword["angle"],
        )
    return row

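# calculate_distance_and_angle is not included in these snippets. A plausible
# minimal version, assuming boxes are {"x0","y0","x1","y1"} dicts compared by
# center point and normalized by the page size (the body, units, and return
# keys here are assumptions; only the call shape comes from the code above):
import math

def calculate_distance_and_angle_sketch(box_a, box_b, x_normal, y_normal):
    # compare box centers, in page-normalized units
    ax = (box_a["x0"] + box_a["x1"]) / 2 / x_normal
    ay = (box_a["y0"] + box_a["y1"]) / 2 / y_normal
    bx = (box_b["x0"] + box_b["x1"]) / 2 / x_normal
    by = (box_b["y0"] + box_b["y1"]) / 2 / y_normal
    dx, dy = bx - ax, by - ay
    return {"distance": math.hypot(dx, dy), "angle": math.atan2(dy, dx)}

print(calculate_distance_and_angle_sketch(
    {"x0": 0, "y0": 0, "x1": 612, "y1": 792},      # whole-page box
    {"x0": 100, "y0": 200, "x1": 160, "y1": 215},  # datapoint box
    612, 792))
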
from datetime import datetime, timedelta
from time import time

import falcon
import ujson as json

from keywords import get_db, get_keywords
from hortiradar import admins, users, time_format
from hortiradar.database import stop_words
from hortiradar.clustering import Config

db = get_db()
tweets = db.tweets
groups = db.groups
KEYWORDS = get_keywords(local=True)
keywords_sync_time = time()

spam_level = Config.getfloat("database:parameters", "spam_level")


def get_dates(req, resp, resource, params):
    """Parse the `start` and `end` datetime parameters."""
    try:
        today = datetime.today()
        today = datetime(today.year, today.month, today.day)
        start = req.get_param("start")
        if start:
            start = datetime.strptime(start, time_format)
        else:
            start = today - timedelta(days=1)
        end = req.get_param("end")

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from keywords import get_keywords, save_keyword_index

import ast
import re
import urllib
import time

if __name__ == '__main__':
    browser = webdriver.PhantomJS()
    try:
        keyword_list = get_keywords()
        for k in keyword_list:
            keyword = k[1].decode('utf8')
            keyword = urllib.quote(keyword.encode('utf8'))
            browser.get(
                'http://zhishu.sogou.com/index/searchHeat?kwdNamesStr=%s&timePeriodType=MONTH&dataType=SEARCH_ALL&queryType=INPUT'
                % keyword)
            try:
                r = re.findall(
                    r'root.SG.data = {"pvList":\[([\s\S]*)],"infoList"',
                    browser.page_source, re.M)
            except Exception:
                r = ''
            if r:
                # ast.literal_eval instead of eval: the matched text is a
                # literal scraped from an untrusted page, and eval on it
                # would execute arbitrary code.
                points = ast.literal_eval(r[0].split('],"infoList"')[0])
                for p in points:
                    date = str(p['date'])
                    date = date[0:4] + '-' + date[4:6] + '-' + date[6:8]

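# ast.literal_eval in isolation: it parses Python literal syntax (numbers,
# strings, dicts, lists, tuples) but refuses arbitrary expressions, which is
# what makes it a safe drop-in above. The sample string is made up.
import ast

scraped = '{"date": 20200101, "pv": 1234}, {"date": 20200102, "pv": 1501}'
points = ast.literal_eval(scraped)  # parses to a tuple of two dicts
print(points[1]["pv"])              # 1501
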
def complete(self):
    """
    An auto completer for python files.

    :return: list of Completion objects, sorted by name and __ comes last.
    :rtype: list
    """
    def follow_imports_if_possible(name):
        # TODO remove this, or move to another place (not used)
        par = name.parent
        if isinstance(par, parsing.Import) and not \
                isinstance(self.parser.user_stmt, parsing.Import):
            new = imports.ImportPath(par).follow(is_goto=True)
            # Only remove the old entry if a new one has been found.
            #print par, new, par.parent
            if new:
                try:
                    return new
                except AttributeError:  # .name undefined
                    pass
        return [name]

    debug.speed('complete start')
    path = self.module.get_path_until_cursor()
    path, dot, like = self._get_completion_parts(path)

    try:
        scopes = list(self._prepare_goto(path, True))
    except NotFoundError:
        scopes = []
        scope_generator = evaluate.get_names_for_scope(
            self.parser.user_scope, self.pos)
        completions = []
        for scope, name_list in scope_generator:
            for c in name_list:
                completions.append((c, scope))
    else:
        completions = []
        debug.dbg('possible scopes', scopes)
        for s in scopes:
            if s.isinstance(evaluate.Function):
                names = s.get_magic_method_names()
            else:
                if isinstance(s, imports.ImportPath):
                    if like == 'import':
                        l = self.module.get_line(self.pos[0])[:self.pos[1]]
                        if not l.endswith('import import'):
                            continue
                    names = s.get_defined_names(on_import_stmt=True)
                else:
                    names = s.get_defined_names()
            for c in names:
                completions.append((c, s))

    if not dot:  # named_params have no dots
        call_def = self.get_in_function_call()
        if call_def:
            if not call_def.module.is_builtin():
                for p in call_def.params:
                    completions.append((p.get_name(), p))

        # Do the completion if there is no path before and no import stmt.
        if (not scopes or not isinstance(scopes[0], imports.ImportPath)) \
                and not path:
            # add keywords
            bs = builtin.Builtin.scope
            completions += ((k, bs) for k in keywords.get_keywords(all=True))

    needs_dot = not dot and path

    comps = []
    for c, s in set(completions):
        n = c.names[-1]
        if settings.case_insensitive_completion \
                and n.lower().startswith(like.lower()) \
                or n.startswith(like):
            if not evaluate.filter_private_variable(
                    s, self.parser.user_stmt, n):
                new = api_classes.Completion(c, needs_dot, len(like), s)
                comps.append(new)

    debug.speed('complete end')
    return sorted(comps, key=lambda x: (x.word.startswith('__'),
                                        x.word.startswith('_'),
                                        x.word.lower()))

def main():  # just for testing
    print("hi")
    #joe_nouns = (get_words_by_pos('joe-biden', 'noun'))
    #print(joe_nouns)
    #joe_verbs = (get_words_by_pos('joe-biden', 'verb'))
    #joe_adjectives=(get_words_by_pos('joe-biden', 'adjective'))
    #joe_adverbs=(get_words_by_pos('joe-biden', 'adverb'))

    #elizabeth_nouns = open("elizabeth-nouns.txt", "w")
    #elizabeth_nouns=(get_words_by_pos('elizabeth-warren', 'noun'))
    #elizabeth_nouns.close()
    #elizabeth_verbs = open("elizabeth-verbs.txt", "w")
    #elizabeth_verbs=(get_words_by_pos('elizabeth-warren', 'verb'))
    #elizabeth_verbs.close()
    #elizabeth_adjectives = open("elizabeth-adjs.txt", "w")
    #elizabeth_adjectives=(get_words_by_pos('elizabeth-warren', 'adjective'))
    #elizabeth_adjectives.close()
    #elizabeth_adverbs = open("elizabeth-adverbs.txt", "w")
    #elizabeth_adverbs=(get_words_by_pos('elizabeth-warren', 'adverb'))
    #elizabeth_adverbs.close()

    #bernie_nouns = open("bernie-nouns.txt", "w")
    #bernie_nouns=(get_words_by_pos('bernie-sanders', 'noun'))
    #bernie_nouns.close()
    #bernie_verbs = open("bernie-verbs.txt", "w")
    #bernie_verbs=(get_words_by_pos('bernie-sanders', 'verb'))
    #bernie_verbs.close()
    #bernie_adjectives = open("bernie-adjs.txt", "w")
    #bernie_adjectives=(get_words_by_pos('bernie-sanders', 'adjective'))
    #bernie_adjectives.close()
    #bernie_adverbs = open("bernie-adverbs.txt", "w")
    #bernie_adverbs=(get_words_by_pos('bernie-sanders', 'adverb'))
    #bernie_adverbs.close()

    #pete_nouns = open("pete-nouns.txt", "w")
    #pete_nouns=(get_words_by_pos('pete-buttigieg', 'noun'))
    #pete_nouns.close()
    #pete_verbs = open("pete-verbs.txt", "w")
    #pete_verbs=(get_words_by_pos('pete-buttigieg', 'verb'))
    #pete_verbs.close()
    #pete_adjectives = open("pete-adjs.txt", "w")
    #pete_adjectives=(get_words_by_pos('pete-buttigieg', 'adjective'))
    #pete_adjectives.close()
    #pete_adverbs = open("pete-adverbs.txt", "w")
    #pete_adverbs=(get_words_by_pos('pete-buttigieg', 'adverb'))
    #pete_adverbs.close()

    #amy_nouns = open("amy-nouns.txt", "w")
    #amy_nouns=(get_words_by_pos('amy-klobuchar', 'noun'))
    #amy_nouns.close()
    #amy_verbs = open("amy-verbs.txt", "w")
    #amy_verbs=(get_words_by_pos('amy-klobuchar', 'verb'))
    #amy_verbs.close()
    #amy_adjectives = open("amy-adjs.txt", "w")
    #amy_adjectives=(get_words_by_pos('amy-klobuchar', 'adjective'))
    #amy_adjectives.close()
    #amy_adverbs = open("amy-adverbs.txt", "w")
    #amy_adverbs=(get_words_by_pos('amy-klobuchar', 'adverb'))
    #amy_adverbs.close()

    #donald_nouns = open("donald-nouns.txt", "w")
    #donald_nouns=(get_words_by_pos('donald-trump', 'noun'))
    #donald_nouns.close()
    #donald_verbs = open("donald-verbs.txt", "w")
    #donald_verbs=(get_words_by_pos('donald-trump', 'verb'))
    #donald_verbs.close()
    #donald_adjectives = open("donald-adjs.txt", "w")
    #donald_adjectives=(get_words_by_pos('donald-trump', 'adjective'))
    #donald_adjectives.close()
    #donald_adverbs = open("donald-adverbs.txt", "w")
    #donald_adverbs=(get_words_by_pos('donald-trump', 'adverb'))
    #donald_adverbs.close()

    #print(get_most_frequent_words(joe_nouns, 10))
    #print(joe_adjectives)
    #print(get_most_frequent_words(donald_adjectives, 10))

    print('bernie')
    print(map_occurances('bernie-sanders', keywords.get_keywords()))
    print('joe')
    print(map_occurances('joe-biden', keywords.get_keywords()))
    print('donald')
    print(map_occurances('donald-trump', keywords.get_keywords()))
    print('elizabeth-warren')
    print(map_occurances('elizabeth-warren', keywords.get_keywords()))
    print('pete-buttigieg')
    print(map_occurances('pete-buttigieg', keywords.get_keywords()))
    print('amy-klobuchar')
    print(map_occurances('amy-klobuchar', keywords.get_keywords()))

def analyze_text_block(text, sentiment_library="textblob",
                       entity_library="spacy", get_sentiment_per_topic=True):
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("\n", ' ')

    entities_res = entities.get_entities(text, library=entity_library)
    keywords_res = keywords.get_keywords(text)
    sentiment_res = sentiment.get_sentiment(text, library=sentiment_library)

    lemmas_dict = {}
    # Calculate sentiment per lemmas, keywords and entities, by averaging
    # the sentiment for all the sentences that they appear in:
    if get_sentiment_per_topic:
        blob = TextBlob(text)
        for sentence in blob.sentences:
            sentence_score = sentiment.get_sentiment(
                str(sentence), library=sentiment_library)['sentiment.score']
            sentence_lemmas = lemmas.get_lemmas(sentence)
            sentence = str(sentence).lower()

            for lemma in sentence_lemmas:
                lemmatxt = lemma['text']
                if lemmatxt in lemmas_dict.keys():
                    lemmas_dict[lemmatxt]['sentiment.score'].append(sentence_score)
                else:
                    lemmas_dict[lemmatxt] = {'sentiment.score': [sentence_score]}

            for keyword in keywords_res:
                word = keyword['text']
                if word.lower() in sentence:
                    if 'sentiment.score' not in keyword.keys():
                        keyword['sentiment.score'] = []
                    keyword['sentiment.score'].append(sentence_score)

            for entity in entities_res:
                word = entity['text']
                if word.lower() in sentence:
                    if 'sentiment.score' not in entity.keys():
                        entity['sentiment.score'] = []
                    entity['sentiment.score'].append(sentence_score)

        for keyword in keywords_res:
            # WARNING: This is a hack. Happens when we have different
            # libraries not agreeing on sentence boundaries!
            if 'sentiment.score' not in keyword.keys():
                keyword['sentiment.score'] = [sentiment_res['sentiment.score']]
            keyword['num.sentences'] = len(keyword['sentiment.score'])
            keyword['sentiment.score'] = np.mean(keyword['sentiment.score'])

        for entity in entities_res:
            # WARNING: This is a hack. Happens when we have different
            # libraries not agreeing on sentence boundaries!
            if 'sentiment.score' not in entity.keys():
                entity['sentiment.score'] = [sentiment_res['sentiment.score']]
            entity['num.sentences'] = len(entity['sentiment.score'])
            entity['sentiment.score'] = np.mean(entity['sentiment.score'])

        lemmas_res = []
        for lemma in lemmas_dict.keys():
            scores = lemmas_dict[lemma]['sentiment.score']
            lemmas_res.append({
                'text': lemma,
                'num.sentences': len(scores),
                'sentiment.score': np.mean(scores),
            })
    else:
        lemmas_res = lemmas.get_lemmas(text)

    results = {
        'entities': entities_res,
        'sentiment': sentiment_res,
        'keywords': keywords_res,
        'lemmas': lemmas_res,
    }
    return results

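# The per-topic averaging idea above, reduced to plain Python: a topic's score
# is the mean sentiment of the sentences that mention it. The sentences and
# scores here are made up for illustration.
sentences = [("the roses were lovely", 0.8),
             ("delivery of the roses was late", -0.4),
             ("packaging was fine", 0.1)]
topic = "roses"
scores = [score for text, score in sentences if topic in text]
print(len(scores), sum(scores) / len(scores))  # 2 sentences, mean 0.2
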