def build_query_domains(js):
    google_domains = set()

    # google search with mld_guesses
    mld_guesses = keywords.guess_mld(js)
    if mld_guesses:
        mld_guess_str = ' '.join(['"{}"'.format(x) for x in mld_guesses])
        logger.print("mld guesses: {}".format(mld_guess_str))
        urls = fetch_urls(mld_guess_str)
        google_domains |= extract_domains(urls)
    else:
        logger.print("no mld guesses")

    langid = 'en'
    # langid = goslate.Goslate().detect(js['text'])
    logger.print("langid: {}".format(langid))

    # google search with keywords
    keyw = keywords.keywords(js, max_count=MAXCOUNT, boost=True, langid=langid)
    # keyw = keywords.keywords(js, max_count=MAXCOUNT, augment=False)
    keywstring = ' '.join(keyw)
    if keywstring:
        logger.print("keywords: {}".format(keywstring))
        urls = fetch_urls(keywstring)
        google_domains |= extract_domains(urls)
    else:
        logger.print("no keywords")

    # google search with augmented keywords
    augkeyw = keywords.keywords(js, max_count=MAXCOUNT, augment=True, langid=langid)
    augkeywstring = ' '.join(augkeyw)
    if augkeywstring != keywstring:
        logger.print("augmented keywords: {}".format(augkeywstring))
        urls = fetch_urls(augkeywstring)
        google_domains |= extract_domains(urls)

    return google_domains, keyw, augkeyw
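# A minimal usage sketch for build_query_domains(). The shape of `js` and the
# helper _load_json() are assumptions taken from how the other functions in
# this module use them; nothing below is part of the function itself.
js = _load_json('site_data.json')                 # hypothetical path
domains, keyw, augkeyw = build_query_domains(js)
for dom in domains:
    # extract_domains() appears to yield (mld, ps) tuples; joining with '.'
    # mirrors how the callers below print them.
    print('.'.join(dom))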
def __init__(self):
    self.lib_version = '1.0.0'

    self.api_key = None
    self.api_private = None

    self.base_url = 'https://rest.quiubas.com'
    self.version = '1.0'

    self.network = network(self)
    self.balance = balance(self)
    self.callback = callback(self)
    self.keywords = keywords(self)
    self.sms = sms(self)
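# Hedged instantiation sketch: the enclosing class name `Quiubas` is an
# assumption (only __init__ is shown), and assigning credentials directly to
# the attributes initialized above may differ from the library's real API.
client = Quiubas()
client.api_key = 'YOUR_API_KEY'          # placeholder credentials
client.api_private = 'YOUR_PRIVATE_KEY'
# Sub-APIs are wired up as attributes in __init__:
print(client.base_url)                   # https://rest.quiubas.com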
def textrank(fileName, original='pagerank', summarize_by=SENTENCE, ratio=0.2, words=None):
    path = '../Raw_text/' + fileName
    with open(path) as f:  # read the whole file, then close it
        text = f.read()
    if original == 'suraj':
        # print('with lexchain')
        namscores = LexicalChain(fileName=path)
    else:
        namscores = []
    # print(namscores)
    if summarize_by == SENTENCE:
        return summarize(text, namscores, original, ratio, words)
    else:
        return keywords(text, ratio, words)
def detail_case(self, browser, case, casecount):
    a = time.time()
    failcount = 0
    faillist = []
    for item in range(len(case)):
        if case[0] != "":
            flag = -1  # guard: filter() may raise before flag is assigned
            try:
                c = case[0].split(":")
                c1 = keywords.keywords()
                flag = c1.filter(browser, c[0])
                if flag == 1:
                    case.remove(case[0])
                elif flag == 0:
                    print("failed: %s" % (c[0]))
                    faillist.append("Test Case%02d:%s" % (casecount, c[0]))
                    case[0] = ""
            except Exception as e:
                print(e)
                print("exception while running step: %s:%s" % (c[0], c[1]))
            b = time.time()
            if flag == 1:
                print("1 case pass, in %.3f s\n" % (b - a))
            elif flag == 0:
                failcount += 1
                print("1 case failed, in %.3f s\n" % (b - a))
            time.sleep(1)
    browser.quit()
    return (failcount, faillist)
def goto_definitions(self):
    """
    Return the definitions of the path under the cursor (the goto
    function!). This follows complicated paths and returns the end, not
    the first definition. The big difference between
    :meth:`goto_assignments` and :meth:`goto_definitions` is that
    :meth:`goto_assignments` doesn't follow imports and statements.
    Multiple objects may be returned, because Python itself is a dynamic
    language, which means that depending on an option you can have two
    different versions of a function.

    :rtype: list of :class:`api_classes.Definition`
    """
    def resolve_import_paths(scopes):
        for s in scopes.copy():
            if isinstance(s, imports.ImportPath):
                scopes.remove(s)
                scopes.update(resolve_import_paths(set(s.follow())))
        return scopes

    goto_path = self._module.get_path_under_cursor()
    context = self._module.get_context()
    scopes = set()
    lower_priority_operators = ('()', '(', ',')
    """Operators that could hide the callee."""
    if next(context) in ('class', 'def'):
        scopes = set([self._module.parser.user_scope])
    elif not goto_path:
        op = self._module.get_operator_under_cursor()
        if op and op not in lower_priority_operators:
            scopes = set([keywords.get_operator(op, self._pos)])

    # Fetch the definition of the callee, if there's no path otherwise.
    if not goto_path:
        (call, _) = self._func_call_and_param_index()
        if call is not None:
            while call.next is not None:
                call = call.next
            # reset cursor position:
            (row, col) = call.name.end_pos
            _pos = (row, max(col - 1, 0))
            self._module = modules.ModuleWithCursor(self._source_path,
                                                    source=self.source,
                                                    position=_pos)
            # then try to find the path again
            goto_path = self._module.get_path_under_cursor()

    if not scopes:
        if goto_path:
            scopes = set(self._prepare_goto(goto_path))
        elif op in lower_priority_operators:
            scopes = set([keywords.get_operator(op, self._pos)])

    scopes = resolve_import_paths(scopes)

    # add keywords
    scopes |= keywords.keywords(string=goto_path, pos=self._pos)

    d = set([api_classes.Definition(s) for s in scopes
             if not isinstance(s, imports.ImportPath._GlobalNamespace)])
    return self._sorted_defs(d)
def goto_definitions(self):
    """
    Return the definitions of the path under the cursor (the goto
    function!). This follows complicated paths and returns the end, not
    the first definition. The big difference between
    :meth:`goto_assignments` and :meth:`goto_definitions` is that
    :meth:`goto_assignments` doesn't follow imports and statements.
    Multiple objects may be returned, because Python itself is a dynamic
    language, which means that depending on an option you can have two
    different versions of a function.

    :rtype: list of :class:`api_classes.Definition`
    """
    def resolve_import_paths(scopes):
        for s in scopes.copy():
            if isinstance(s, imports.ImportPath):
                scopes.remove(s)
                scopes.update(resolve_import_paths(set(s.follow())))
        return scopes

    goto_path = self._module.get_path_under_cursor()
    context = self._module.get_context()
    scopes = set()
    lower_priority_operators = ('()', '(', ',')
    """Operators that could hide the callee."""
    if next(context) in ('class', 'def'):
        scopes = set([self._module.parser.user_scope])
    elif not goto_path:
        op = self._module.get_operator_under_cursor()
        if op and op not in lower_priority_operators:
            scopes = set([keywords.get_operator(op, self._pos)])

    # Fetch the definition of the callee, if there's no path otherwise.
    if not goto_path:
        (call, _) = self._func_call_and_param_index()
        if call is not None:
            while call.next is not None:
                call = call.next
            # reset cursor position:
            (row, col) = call.name.end_pos
            _pos = (row, max(col - 1, 0))
            self._module = modules.ModuleWithCursor(
                self._source_path, source=self.source, position=_pos)
            # then try to find the path again
            goto_path = self._module.get_path_under_cursor()

    if not scopes:
        if goto_path:
            scopes = set(self._prepare_goto(goto_path))
        elif op in lower_priority_operators:
            scopes = set([keywords.get_operator(op, self._pos)])

    scopes = resolve_import_paths(scopes)

    # add keywords
    scopes |= keywords.keywords(string=goto_path, pos=self._pos)

    d = set([api_classes.Definition(s) for s in scopes
             if s is not imports.ImportPath.GlobalNamespace])
    return self._sorted_defs(d)
def textrank(text, summarize_by=SENTENCE, ratio=0.2, words=None):
    if summarize_by == SENTENCE:
        return summarize(text, ratio, words)
    else:
        return keywords(text, ratio, words)
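# Hedged usage sketch: SENTENCE comes from the same module; any other value
# for summarize_by falls through to the keyword branch, so passing None here
# is a convention of this sketch, not a documented constant.
text = open('article.txt').read()                  # hypothetical input file
summary = textrank(text, summarize_by=SENTENCE)    # top 20% of sentences
top_keywords = textrank(text, summarize_by=None)   # keyword-extraction branch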
def textrank(text1, text2, summarize_by=SENTENCE, ratio=DEFAULT_RATIO,
             words=None, additional_stopwords=None):
    if summarize_by == SENTENCE:
        return summarize(text1, text2, ratio, words,
                         additional_stopwords=additional_stopwords)
    else:
        return keywords(text1, text2, ratio, words,
                        additional_stopwords=additional_stopwords)
import codecs
import re

keytrans = []
ini_tot = []
tot = []
for file_i in files:
    with codecs.open(path + '/' + file_i, 'r', 'UTF-8') as f:
        passage = f.read()
    lista = re.findall(r"[\w']+", passage)
    cap_list = [word.upper() for word in lista]
    scriptlist = RemStopW(cap_list, cap_stopwords)  # list
    script = ' '.join(scriptlist)  # script - cleaned

    # ==============
    # Find key-words
    # ==============
    # For each text find the 20 most common key-words (single=k1, bigram=k2
    # or trigram=k3), excluding all the stop-words.
    [ini_tot_words, tot_words, k1, k2, k3] = keywords(script)
    keytrans.append([k1, k2, k3])  # list of lists - k1, k2, k3 for each text
    ini_tot.append(ini_tot_words)  # total number of words in the original text
    tot.append(tot_words)          # total number of words counted after cleaning

# ==================
# Printing some info
# ==================
for i, k in enumerate(keytrans):
    print('Total number of words in ' + files[i])
    print(str(tot[i]))
    print('The 10 most common single key-words in ' + files[i])
    for w in k[0][0:10]:
        print(str(w))
    print('The 5 most common bigram key-words in ' + files[i])
    for w in k[1][0:5]:
        print(str(w))
def main_case(self, case, casecount):
    browser, case = keywords.keywords().case_main(case)
    failcount, faillist = self.detail_case(browser, case, casecount)
    return (failcount, faillist)
def is_phish2(ws=None):
    """
    Decide whether a website is phishing using its keywords and a Google
    search based on those.

    Parameters
    ----------
    ws: website object or None
        contains all downloaded information about the site

    Returns
    -------
    rank: int
        * -1 = unresolved, fetching a website failed
        * 0 = not phish
        * 1 = suspicious
        * 2 = phish
    description: str
        above description for the numerical values
    targets: set
        potential targets in case of 1 or 2, empty when rank is -1, 0
    """
    if ws is None:
        # logger.print("website object is None; cannot continue")
        return -1, 'unresolved', set()

    # logger.print("siteid: {}".format(ws.siteid))
    # logger.print("landing url: {}".format(ws.landurl[:80]))

    mld, ps = keywords.split_mld_ps(ws.landurl)

    # 1. TESTS
    # password?
    pw = ws.has_password
    # if pw:
    #     logger.print("asks for a password")
    # else:
    #     logger.print("does not ask for a password")

    google_domains, keyw, augkeyw = (
        extract_domains(ws.js['urls_keywords'] + ws.js['urls_augmented']),
        ws.keywords(),
        ws.augmented_keywords())

    # logger.print("query-domains:")
    # for dom in google_domains:
    #     logger.print('.'.join(dom), nots=True)

    if pw:
        # logger.print("password found")
        prominent_domains_found = prominent_domains(ws.js, keyw + augkeyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            # logger.print("a query-mld matches with site-mld -> not phish")
            return 0, 'not phish', set()
        else:
            # logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                # logger.print("prominent domains found -> phish")
                return 2, 'phish', prominent_domains_found
            else:
                # logger.print("did not find prominent mlds")
                # logger.print("doing ocr")
                ocrkeyw = keywords.keywords(ws.js, max_count=MAXCOUNT,
                                            augment=True, use_ocr=True)
                keywstring = ' '.join(ocrkeyw)
                # logger.print("ocr keywords: {}".format(keywstring))
                urls = ws.js['urls_ocr']
                google_domains = extract_domains(urls)
                prominent_domains_found = prominent_domains(ws.js, ocrkeyw,
                                                            google_domains,
                                                            extend_search=True)
                mld_in_gmld = (mld, ps) in google_domains
                if mld_in_gmld:
                    # logger.print("a query-mld matches with site-mld -> not phish")
                    return 0, 'not phish', set()
                else:
                    # logger.print("no query-mld matches with site-mld")
                    if prominent_domains_found:
                        # logger.print("prominent domains found -> phish")
                        return 2, 'phish', prominent_domains_found
                    else:
                        # logger.print("prominent domains not found -> possibly phish")
                        return 1, 'suspicious', set()
    else:
        # logger.print("password not found")
        prominent_domains_found = prominent_domains(ws.js, keyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            # logger.print("a query-mld matches with site-mld -> not phish")
            return 0, 'not phish', set()
        else:
            # logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                # logger.print("prominent domains found -> suspicious")
                return 1, 'suspicious', prominent_domains_found
            else:
                # logger.print("prominent domains not found -> not phish")
                return 0, 'not phish', set()
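# Hedged driver for is_phish2(): how the website object is built is an
# assumption; only the attributes read above (landurl, has_password, js,
# keywords(), augmented_keywords()) are known from the snippet.
ws = fetch_website('http://example.com/login')   # hypothetical helper
rank, description, targets = is_phish2(ws)
print(rank, description)
if rank in (1, 2):
    # targets holds (mld, ps) tuples of the likely phishing targets
    print('possible targets:', {'.'.join(t) for t in targets})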
def is_phish(js=None, jspath='', url=''):
    """
    Decide whether a website is phishing using its keywords and a Google
    search based on those.

    Parameters
    ----------
    js: dict, optional
        contains site data
    jspath: str, optional
        path to a json file with the site data
    url: str, optional
        url of a website

    Returns
    -------
    rank: int
        * -1 = unresolved, fetching a website failed
        * 0 = not phish
        * 1 = suspicious
        * 2 = phish
    description: str
        above description for the numerical values
    targets: set
        potential targets in case of 1 or 2, empty when rank is -1, 0
    """
    # avoid a mutable default argument
    if js is None:
        js = {}

    # load json
    if url:
        fetcher = website_fetcher.WebsiteFetcher(logging=True, confirm=True)
        # sitedata, screenshot = fetcher.fetch_sitedata_and_screenshot(url)
        # js = sitedata
        jspath, sspath = fetcher.fetch_and_save_data(url)
        js = _load_json(jspath)
    elif jspath:
        js = _load_json(jspath)
        sspath = _get_screenshot_path(jspath)

    if not js:
        logger.print("json file is empty; cannot continue")
        return -1, 'unresolved', set()

    logger.print("siteid: {}".format(js['siteid']))
    landurl = js['landurl']
    # logger.print("landing url: {}".format(landurl[:80]))
    logger.print("loglinks:")
    for link in js['loglinks']:
        logger.print(link, nots=True)

    mld, ps = keywords.split_mld_ps(landurl)
    # logger.print("main level domain: {}".format(mld))

    # 1. TESTS
    # password?
    pw = _asks_password(js)
    if pw:
        logger.print("asks for a password")
    else:
        logger.print("does not ask for a password")

    google_domains, keyw, augkeyw = build_query_domains(js)
    logger.print("query-domains:")
    for dom in google_domains:
        logger.print('.'.join(dom), nots=True)

    if pw:
        prominent_domains_found = prominent_domains(js, keyw + augkeyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            logger.print("a query-mld matches with site-mld -> not phish")
            return 0, 'not phish', set()
        else:
            logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                logger.print("prominent domains found -> phish")
                return 2, 'phish', prominent_domains_found
            else:
                logger.print("did not find prominent mlds")
                logger.print("doing ocr")
                js = _ocr_on_json(jspath)
                ocrkeyw = keywords.keywords(js, max_count=MAXCOUNT,
                                            augment=True, use_ocr=True)
                keywstring = ' '.join(ocrkeyw)
                logger.print("ocr keywords: {}".format(keywstring))
                urls = fetch_urls(keywstring)
                google_domains = extract_domains(urls)
                prominent_domains_found = prominent_domains(js, ocrkeyw,
                                                            google_domains,
                                                            extend_search=True)
                mld_in_gmld = (mld, ps) in google_domains
                if mld_in_gmld:
                    logger.print("a query-mld matches with site-mld -> not phish")
                    return 0, 'not phish', set()
                else:
                    logger.print("no query-mld matches with site-mld")
                    if prominent_domains_found:
                        logger.print("prominent domains found -> phish")
                        return 2, 'phish', prominent_domains_found
                    else:
                        logger.print("prominent domains not found -> possibly phish")
                        return 1, 'suspicious', set()
    else:
        prominent_domains_found = prominent_domains(js, keyw, google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            logger.print("a query-mld matches with site-mld -> not phish")
            return 0, 'not phish', set()
        else:
            logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                logger.print("prominent domains found -> suspicious")
                return 1, 'suspicious', prominent_domains_found
            else:
                logger.print("prominent domains not found -> not phish")
                return 0, 'not phish', set()
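# Hedged invocation sketch for is_phish(): the docstring allows three entry
# points; both the URL and the json path below are made up.
rank, description, targets = is_phish(url='http://example.com/login')
# ...or classify previously fetched site data from disk:
rank, description, targets = is_phish(jspath='data/site_12345.json')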
        # (tail of the enclosing entity-extraction helper)
        entity_names.extend(extract_text(child))
    return entity_names


def save_results(config, data):
    connection, cursor = connect(**config)
    cursor.execute("""CREATE TABLE IF NOT EXISTS named_entities(
        id INT(6) UNSIGNED AUTO_INCREMENT PRIMARY KEY,
        first_name VARCHAR(50) NOT NULL,
        last_name VARCHAR(70) NOT NULL,
        job_title VARCHAR(255) NOT NULL,
        email VARCHAR(100) NOT NULL,
        url VARCHAR(255) NOT NULL)""")
    prepared_data = [tuple(item.values()) for item in data]
    cursor.executemany((
        "INSERT INTO named_entities (first_name, last_name, job_title, email, url) "
        "VALUES (%s, %s, %s, %s, %s)"), prepared_data)
    connection.commit()
    row_count = cursor.rowcount
    cursor.close()
    return row_count


if __name__ == "__main__":
    titles = keywords().get('academic_title', [])
    config = get_db_config()
    data = get_data(config['remote'])
    entities = use_stanford_ner(data)
    save_results(config['local'], entities)
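# save_results() flattens each record with tuple(item.values()), so the
# insertion order of the dict keys must match the column order of the INSERT
# statement. A sketch of the expected record shape (the values are made up):
entities = [{
    'first_name': 'Ada',
    'last_name': 'Lovelace',
    'job_title': 'Prof.',
    'email': 'ada@example.edu',
    'url': 'https://example.edu/~ada',
}]
rows_inserted = save_results(get_db_config()['local'], entities)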
import pandas as pd

# Read the data
data = pd.read_csv("../data/data.csv")

# Split the data by publisher
data1 = data.iloc[0:127, :]     # publisher: ANNALS OF STATISTICS
data2 = data.iloc[127:174, :]   # publisher: ROYAL
data3 = data.iloc[174:318, :]   # publisher: AMERICAN
data4 = data.iloc[318:393, :]   # publisher: BIOMETRIKA
data_list = [data1, data2, data3, data4]

# Keyword statistics per publisher
from keywords import keywords
for i in data_list:
    keywords(i)
    print("\n")

# Author-information statistics per publisher
from author_info import authorInfo
for i in data_list:
    authorInfo(i)
    print("\n")

# Author collaboration network per publisher
from author_relation import draw_relation
for i in data_list:
    draw_relation(i)
    print("\n")
def run(self):
    self.variables.sc = SlackClient(token)
    logger('crit', BOT_NAME + " Connected!")
    if self.variables.sc.rtm_connect():
        # Mentions arrive as "<@UXXXX>"; the closing ">" is left off here so
        # the prefix checks and re-assembly below can add it back explicitly.
        bot_mention = "<@{}".format(
            self.variables.sc.server.login_data["self"]["id"])
        while self.should_run:
            for evt in self.variables.sc.rtm_read():
                # Original command parsing
                if "type" in evt and evt["type"] == "message" and "text" in evt:
                    # This allows replies in threads. Current issue: the picture
                    # shows up as a bot shadow in the threaded conversation summary.
                    if "thread_ts" not in evt:
                        evt.update({'thread_ts': ''})
                    # This seems like the right place to snoop.
                    count = keywords(self.variables.sc, evt,
                                     self.variables.yamldata,
                                     self.variables.keyword_count)
                    self.variables.count = count
                    message = evt["text"].strip()
                    # This logic is used for misdirection. It allows you to send
                    # a message starting with a channel name to have botiana
                    # deliver it there. Clever.
                    channel = ''
                    command = ''
                    if ("channel" in evt and evt["type"] == "message"
                            and evt["channel"].startswith("D")):
                        try:
                            _, channel, command, message = message.split(None, 3)
                        except ValueError:
                            try:
                                _, channel, command = message.split(None, 2)
                            except ValueError:
                                pass
                    if channel.startswith("<#C"):
                        logger('warn', 'misdirection module invoked')
                        misdirected_channel = str(re.findall(r'\w+', channel)[0])
                        evt["channel"] = misdirected_channel
                        message = bot_mention + "> " + command + " " + message
                    if message.startswith(bot_mention):
                        try:
                            # have a botname, command, and message?
                            _, command, message = message.split(None, 2)
                            message_router(self.variables, bot_mention, evt,
                                           command, message)
                        except ValueError:
                            try:
                                # maybe just a botname and command?
                                _, command = message.split(None, 1)
                                message_router(self.variables, bot_mention, evt,
                                               command, '')
                            except ValueError:
                                # this should never happen....
                                logger('info', "value error in command parsing "
                                               "- this should never happen")
                    elif bot_mention in message:
                        if enable_message_processing:
                            try:
                                logger('info',
                                       "routing message to message_processing")
                                message_router(self.variables, bot_mention, evt,
                                               message_processing_module, message)
                            except ValueError:
                                logger('info', "failed to send message to "
                                               "message_processing module")
            self.variables.current_time = time.time()
            time.sleep(.1)
    else:
        if self.variables.sc.server.login_data is None:
            logger('crit', "Connection failed. Probably a bad/missing token.")
        else:
            logger('crit', "Connection failed. Server response: {}".format(
                self.variables.sc.server.login_data["ok"]))