def build_query_domains(js):
    """Run Google searches with mld guesses, keywords and augmented keywords
    from the site data; return the collected result domains together with
    both keyword lists."""

    google_domains = set()

    # google search with mld_guesses
    mld_guesses = keywords.guess_mld(js)
    if mld_guesses:
        mld_guess_str = ' '.join(['"{}"'.format(x) for x in mld_guesses])
        logger.print("mld guesses: {}".format(mld_guess_str))
        urls = fetch_urls(mld_guess_str)
        google_domains |= extract_domains(urls)
    else:
        logger.print("no mld guesses")

    langid = 'en'
    # langid = goslate.Goslate().detect(js['text'])
    logger.print("langid: {}".format(langid))
    # google search with keywords
    keyw = keywords.keywords(js, max_count=MAXCOUNT, boost=True, langid=langid)
    # keyw = keywords.keywords(js, max_count=MAXCOUNT, augment=False)
    keywstring = ' '.join(keyw)
    if keywstring:
        logger.print("keywords: {}".format(keywstring))
        urls = fetch_urls(keywstring)
        google_domains |= extract_domains(urls)
    else:
        logger.print("no keywords")

    # google search with augmented keywords
    augkeyw = keywords.keywords(js,
                                max_count=MAXCOUNT,
                                augment=True,
                                langid=langid)
    augkeywstring = ' '.join(augkeyw)
    if augkeywstring != keywstring:
        logger.print("augmented keywords: {}".format(augkeywstring))
        urls = fetch_urls(augkeywstring)
        google_domains |= extract_domains(urls)

    return google_domains, keyw, augkeyw
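The helpers fetch_urls and extract_domains are not defined in this section. Judging from how the result is consumed further down (membership tests like (mld, ps) in google_domains and printing '.'.join(dom)), the domains appear to be (main-level-domain, public-suffix) pairs. A minimal, hypothetical sketch of such an extract_domains, assuming the third-party tldextract package, could look like this:

# Hypothetical sketch of extract_domains (an assumption, not the project's code):
# turn result URLs into (mld, public_suffix) tuples so that membership tests
# like `(mld, ps) in google_domains` and prints like '.'.join(dom) work.
import tldextract

def extract_domains(urls):
    domains = set()
    for url in urls:
        parts = tldextract.extract(url)  # e.g. ExtractResult('www', 'example', 'co.uk')
        if parts.domain and parts.suffix:
            domains.add((parts.domain, parts.suffix))
    return domains
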
Example #2
    def __init__(self):
        self.lib_version = '1.0.0'
        self.api_key = None
        self.api_private = None
        self.base_url = 'https://rest.quiubas.com'
        self.version = '1.0'

        self.network = network(self)

        self.balance = balance(self)
        self.callback = callback(self)
        self.keywords = keywords(self)
        self.sms = sms(self)
Example #3
def textrank(fileName,
             original='pagerank',
             summarize_by=SENTENCE,
             ratio=0.2,
             words=None):

    path = '../Raw_text/' + fileName
    with open(path) as f:  # open the file and read all of its text
        text = f.read()

    if original == 'suraj':
        # score sentences with the lexical-chain approach
        namscores = LexicalChain(fileName=path)
    else:
        namscores = []

    if summarize_by == SENTENCE:
        return summarize(text, namscores, original, ratio, words)
    else:
        return keywords(text, ratio, words)
Example #4
    def detail_case(self, browser, case, casecount):
        a = time.time()
        failcount = 0
        faillist = []
        flag = None  # result of the most recently executed step
        for _ in range(len(case)):

            if case[0] != "":
                try:
                    c = case[0].split(":")
                    c1 = keywords.keywords()

                    flag = c1.filter(browser, c[0])
                    if flag == 1:
                        # step passed: drop it and move on to the next one
                        case.remove(case[0])

                    elif flag == 0:
                        print("failed: %s" % (c[0]))
                        faillist.append("Test Case%02d:%s" % (casecount, c[0]))
                        case[0] = ""

                except Exception as e:
                    print(e)
                    print("failed: %s" % (':'.join(c)))

        b = time.time()

        if flag == 1:
            print("1 case pass, in %.3f s\n" % (b - a))
        elif flag == 0:
            failcount += 1
            print("1 case failed, in %.3f s\n" % (b - a))

        time.sleep(1)
        browser.quit()

        return (failcount, faillist)
Example #5
File: api.py Project: rayleyva/jedi
    def goto_definitions(self):
        """
        Return the definitions of the path under the cursor.  goto function!
        This follows complicated paths and returns the end, not the first
        definition. The big difference between :meth:`goto_assignments` and
        :meth:`goto_definitions` is that :meth:`goto_assignments` doesn't
        follow imports and statements. Multiple objects may be returned,
        because Python itself is a dynamic language, which means depending on
        an option you can have two different versions of a function.

        :rtype: list of :class:`api_classes.Definition`
        """
        def resolve_import_paths(scopes):
            for s in scopes.copy():
                if isinstance(s, imports.ImportPath):
                    scopes.remove(s)
                    scopes.update(resolve_import_paths(set(s.follow())))
            return scopes

        goto_path = self._module.get_path_under_cursor()

        context = self._module.get_context()
        scopes = set()
        lower_priority_operators = ('()', '(', ',')
        """Operators that could hide callee."""
        if next(context) in ('class', 'def'):
            scopes = set([self._module.parser.user_scope])
        elif not goto_path:
            op = self._module.get_operator_under_cursor()
            if op and op not in lower_priority_operators:
                scopes = set([keywords.get_operator(op, self._pos)])

        # If there's no path, fetch the definition of the callee instead.
        if not goto_path:
            (call, _) = self._func_call_and_param_index()
            if call is not None:
                while call.next is not None:
                    call = call.next
                # reset cursor position:
                (row, col) = call.name.end_pos
                _pos = (row, max(col - 1, 0))
                self._module = modules.ModuleWithCursor(self._source_path,
                                                        source=self.source,
                                                        position=_pos)
                # then try to find the path again
                goto_path = self._module.get_path_under_cursor()

        if not scopes:
            if goto_path:
                scopes = set(self._prepare_goto(goto_path))
            elif op in lower_priority_operators:
                scopes = set([keywords.get_operator(op, self._pos)])

        scopes = resolve_import_paths(scopes)

        # add keywords
        scopes |= keywords.keywords(string=goto_path, pos=self._pos)

        d = set([
            api_classes.Definition(s) for s in scopes
            if not isinstance(s, imports.ImportPath._GlobalNamespace)
        ])
        return self._sorted_defs(d)
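The docstring above explains the user-facing contract; in the older jedi releases this api.py comes from, goto_definitions was reached through jedi.Script. A hedged usage sketch (the Script signature shown is the old positional one and may differ between versions):

# Usage sketch, assuming an older jedi release with Script(source, line, column, path).
import jedi

source = "import json\njson.load"
script = jedi.Script(source, 2, len("json.load"), "example.py")
for definition in script.goto_definitions():
    # Definition objects expose attributes such as .name and .module_path
    print(definition.name, definition.module_path)
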
Example #6
    def goto_definitions(self):
        """
        Return the definitions of the path under the cursor.  goto function!
        This follows complicated paths and returns the end, not the first
        definition. The big difference between :meth:`goto_assignments` and
        :meth:`goto_definitions` is that :meth:`goto_assignments` doesn't
        follow imports and statements. Multiple objects may be returned,
        because Python itself is a dynamic language, which means depending on
        an option you can have two different versions of a function.

        :rtype: list of :class:`api_classes.Definition`
        """
        def resolve_import_paths(scopes):
            for s in scopes.copy():
                if isinstance(s, imports.ImportPath):
                    scopes.remove(s)
                    scopes.update(resolve_import_paths(set(s.follow())))
            return scopes

        goto_path = self._module.get_path_under_cursor()

        context = self._module.get_context()
        scopes = set()
        lower_priority_operators = ('()', '(', ',')
        """Operators that could hide callee."""
        if next(context) in ('class', 'def'):
            scopes = set([self._module.parser.user_scope])
        elif not goto_path:
            op = self._module.get_operator_under_cursor()
            if op and op not in lower_priority_operators:
                scopes = set([keywords.get_operator(op, self._pos)])

        # If there's no path, fetch the definition of the callee instead.
        if not goto_path:
            (call, _) = self._func_call_and_param_index()
            if call is not None:
                while call.next is not None:
                    call = call.next
                # reset cursor position:
                (row, col) = call.name.end_pos
                _pos = (row, max(col - 1, 0))
                self._module = modules.ModuleWithCursor(
                    self._source_path,
                    source=self.source,
                    position=_pos)
                # then try to find the path again
                goto_path = self._module.get_path_under_cursor()

        if not scopes:
            if goto_path:
                scopes = set(self._prepare_goto(goto_path))
            elif op in lower_priority_operators:
                scopes = set([keywords.get_operator(op, self._pos)])

        scopes = resolve_import_paths(scopes)

        # add keywords
        scopes |= keywords.keywords(string=goto_path, pos=self._pos)

        d = set([api_classes.Definition(s) for s in scopes
                 if s is not imports.ImportPath.GlobalNamespace])
        return self._sorted_defs(d)
Example #7
def textrank(text, summarize_by=SENTENCE, ratio=0.2, words=None):
    if summarize_by == SENTENCE:
        return summarize(text, ratio, words)
    else:
        return keywords(text, ratio, words)
Example #8
def textrank(text1, text2, summarize_by=SENTENCE, ratio=DEFAULT_RATIO, words=None, additional_stopwords=None):
    if summarize_by == SENTENCE:
        return summarize(text1, text2, ratio, words, additional_stopwords=additional_stopwords)
    else:
        return keywords(text1, text2, ratio, words, additional_stopwords=additional_stopwords)
Example #9
import codecs
import re

keytrans = []
ini_tot = []
tot = []
for file_i in files:
    with codecs.open(path + '/' + file_i, 'r', 'UTF-8') as f:
        passage = f.read()
    lista = re.findall(r"[\w']+", passage)
    cap_list = [word.upper() for word in lista]
    scriptlist = RemStopW(cap_list, cap_stopwords)  #list
    script = ' '.join(scriptlist)  #script - cleaned
    # ==============
    # Find key-words
    # ==============
    # For each text find the 20-most common key-words (single=k1, bigram=k2 or trigram=k3)
    # excluding all the stop-words
    [ini_tot_words, tot_words, k1, k2, k3] = keywords(script)
    keytrans.append([k1, k2, k3])  #list of list - k1,k2,k3 for each text
    ini_tot.append(ini_tot_words)  # total # of words in the original text
    tot.append(tot_words)  # total # of words counted after cleaning
# ==================
# Printing some info
# ==================
for i, k in enumerate(keytrans):
    print('Total number of words in ' + files[i])
    print(str(tot[i]))
    print('The 10 most common single key-words in ' + files[i])
    for w in k[0][:10]:
        print(str(w))
    print('The 5 most common bigram key-words in ' + files[i])
    for w in k[1][:5]:
        print(str(w))
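The project-specific keywords(script) called above is not shown; from its use it returns the word totals plus the most common single words, bigrams and trigrams. A rough, hypothetical sketch of that interface using only the standard library (in this sketch both totals are the cleaned count, since only the cleaned text is available):

# Hypothetical sketch of the keywords(script) interface assumed above:
# returns [ini_tot_words, tot_words, k1, k2, k3], where k1/k2/k3 are lists of
# (term, count) pairs for the most common unigrams, bigrams and trigrams.
from collections import Counter

def keywords(script, top=20):
    words = script.split()
    unigrams = Counter(words)
    bigrams = Counter(zip(words, words[1:]))
    trigrams = Counter(zip(words, words[1:], words[2:]))
    return [len(words), len(words),
            unigrams.most_common(top),
            bigrams.most_common(top),
            trigrams.most_common(top)]
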
Example #10
    def main_case(self, case, casecount):
        browser, case = keywords.keywords().case_main(case)
        failcount, faillist = self.detail_case(browser, case, casecount)
        return (failcount, faillist)
Example #11
def is_phish2(ws=None):
    """
    Decide whether a website is phishing using its keywords and a Google search
    based on those.

    Parameters
    ----------
    ws: website object or None
        contains all downloaded information about the site 

    Returns
    -------
    rank: int
        * -1 = unresolved, fetching a website failed
        *  0 = not phish
        *  1 = suspicious
        *  2 = phish
    description: str
        above description for the numerical values
    targets: set
        potential targets in case of 1 or 2, empty when rank is -1, 0
    """

    if ws is None:
        # logger.print("website object is None; cannot continue")
        return -1, 'unresolved', set()

    # logger.print("siteid: {}".format(ws.siteid))
    # logger.print("landing url: {}".format(ws.landurl[:80]))
    mld, ps = keywords.split_mld_ps(ws.landurl)

    # 1. TESTS
    # password?
    pw = ws.has_password
    # if pw:
    #     logger.print("asks for a password")
    # else:
    #     logger.print("does not ask for a password")

    google_domains = extract_domains(ws.js['urls_keywords'] +
                                     ws.js['urls_augmented'])
    keyw = ws.keywords()
    augkeyw = ws.augmented_keywords()
    # logger.print("query-domains:")
    # for dom in google_domains:
    #     logger.print('.'.join(dom), nots=True)

    if pw:
        # logger.print("password found")
        prominent_domains_found = prominent_domains(ws.js,
                                                    keyw + augkeyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            # logger.print("a query-mld matches with site-mld -> not phish")
            # return 0
            return 0, 'not phish', set()
        else:
            # logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                # logger.print("prominent domains found -> phish")
                # return 2
                return 2, 'phish', prominent_domains_found
            else:
                # logger.print("did not find prominent mlds")
                # logger.print("doing ocr")
                ocrkeyw = keywords.keywords(ws.js,
                                            max_count=MAXCOUNT,
                                            augment=True,
                                            use_ocr=True)
                keywstring = ' '.join(ocrkeyw)
                # logger.print("ocr keywords: {}".format(keywstring))
                urls = ws.js['urls_ocr']
                google_domains = extract_domains(urls)
                prominent_domains_found = prominent_domains(ws.js,
                                                            ocrkeyw,
                                                            google_domains,
                                                            extend_search=True)
                mld_in_gmld = (mld, ps) in google_domains
                if mld_in_gmld:
                    # logger.print("a query-mld matches with site-mld -> not phish")
                    # return 0
                    return 0, 'not phish', set()
                else:
                    # logger.print("no query-mld matches with site-mld")
                    if prominent_domains_found:
                        # logger.print("prominent domains found -> phish")
                        # return 2
                        return 2, 'phish', prominent_domains_found
                    else:
                        # logger.print("prominent domains not found -> possibly phish")
                        # return 1
                        return 1, 'suspicious', set()
    else:
        # logger.print("password not found")
        prominent_domains_found = prominent_domains(ws.js,
                                                    keyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            # logger.print("a query-mld matches with site-mld -> not phish")
            # return 0
            return 0, 'not phish', set()
        else:
            # logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                # logger.print("prominent domains found -> suspicious")
                # return 1
                return 1, 'suspicious', prominent_domains_found
            else:
                # logger.print("prominent domains not found -> not phish")
                # return 0
                return 0, 'not phish', set()
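A caller of is_phish2 only needs the (rank, description, targets) triple described in the docstring. A minimal, hypothetical driver (the website object ws is assumed to come from elsewhere in the project):

# Hypothetical caller sketch: maps the documented rank codes to an action.
def report_site(ws):
    rank, description, targets = is_phish2(ws)
    if rank == -1:
        print("fetch failed, no verdict")
    elif rank == 0:
        print("verdict: not phish")
    else:  # 1 = suspicious, 2 = phish
        print("verdict: {} (possible targets: {})".format(
            description, ', '.join('.'.join(t) for t in sorted(targets))))
    return rank
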
Example #12
def is_phish(js=None, jspath='', url=''):
    """
    Decide whether a website is phishing using its keywords and a Google search
    based on those.

    Parameters
    ----------
    js: dict, optional
        contains site data
    jspath: str, optional
        path to a json file with the site data
    url: str, optional
        url of a website

    Returns
    -------
    rank: int
        * -1 = unresolved, fetching a website failed
        *  0 = not phish
        *  1 = suspicious
        *  2 = phish
    description: str
        above description for the numerical values
    targets: set
        potential targets in case of 1 or 2, empty when rank is -1, 0
    """

    # load json
    if url:
        fetcher = website_fetcher.WebsiteFetcher(logging=True, confirm=True)
        # sitedata, screenshot = fetcher.fetch_sitedata_and_screenshot(url)
        # js = sitedata
        jspath, sspath = fetcher.fetch_and_save_data(url)
        js = _load_json(jspath)
    elif jspath:
        js = _load_json(jspath)
        sspath = _get_screenshot_path(jspath)
    if not js:
        logger.print("json file is empty; cannot continue")
        return -1, 'unresolved', set()

    logger.print("siteid: {}".format(js['siteid']))
    landurl = js['landurl']
    # logger.print("landing url: {}".format(landurl[:80]))
    logger.print("loglinks:")
    for link in js['loglinks']:
        logger.print(link, nots=True)

    mld, ps = keywords.split_mld_ps(landurl)
    # logger.print("main level domain: {}".format(mld))

    # 1. TESTS
    # password?
    pw = _asks_password(js)
    if pw:
        logger.print("asks for a password")
    else:
        logger.print("does not ask for a password")

    google_domains, keyw, augkeyw = build_query_domains(js)
    logger.print("query-domains:")
    for dom in google_domains:
        logger.print('.'.join(dom), nots=True)

    if pw:
        # logger.print("asks for a password")
        prominent_domains_found = prominent_domains(js,
                                                    keyw + augkeyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            logger.print("a query-mld matches with site-mld -> not phish")
            return 0, 'not phish', set()
        else:
            logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                logger.print("prominent domains found -> phish")
                return 2, 'phish', prominent_domains_found
            else:
                logger.print("did not find prominent mlds")
                logger.print("doing ocr")
                js = _ocr_on_json(jspath)
                ocrkeyw = keywords.keywords(js,
                                            max_count=MAXCOUNT,
                                            augment=True,
                                            use_ocr=True)
                keywstring = ' '.join(ocrkeyw)
                logger.print("ocr keywords: {}".format(keywstring))
                urls = fetch_urls(keywstring)
                google_domains = extract_domains(urls)
                prominent_domains_found = prominent_domains(js,
                                                            ocrkeyw,
                                                            google_domains,
                                                            extend_search=True)
                mld_in_gmld = (mld, ps) in google_domains
                if mld_in_gmld:
                    logger.print(
                        "a query-mld matches with site-mld -> not phish")
                    return 0, 'not phish', set()
                else:
                    logger.print("no query-mld matches with site-mld")
                    if prominent_domains_found:
                        logger.print("prominent domains found -> phish")
                        return 2, 'phish', prominent_domains_found
                    else:
                        logger.print(
                            "prominent domains not found -> possibly phish")
                        return 1, 'suspicious', set()
    else:
        # logger.print("password not found")
        prominent_domains_found = prominent_domains(js,
                                                    keyw,
                                                    google_domains,
                                                    extend_search=False)
        mld_in_gmld = (mld, ps) in google_domains
        if mld_in_gmld:
            logger.print("a query-mld matches with site-mld -> not phish")
            return 0, 'not phish', set()
        else:
            logger.print("no query-mld matches with site-mld")
            if prominent_domains_found:
                logger.print("prominent domains found -> suspicious")
                return 1, 'suspicious', prominent_domains_found
            else:
                logger.print("prominent domains not found -> not phish")
                return 0, 'not phish', set()
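As its docstring notes, is_phish can start from a live url, a saved json path, or an already-loaded dict. A hedged invocation sketch (the URL and paths below are placeholders):

# Hypothetical invocation sketch; the URL and paths are placeholders.
# 1. from a live site (fetches and saves the data first)
rank, description, targets = is_phish(url='http://example.com/login')

# 2. from previously saved site data on disk
rank, description, targets = is_phish(jspath='sitedata/example.json')

# 3. from an already-loaded dict, reusing the module's own _load_json helper
sitedata = _load_json('sitedata/example.json')
rank, description, targets = is_phish(js=sitedata)
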
Example #13
                entity_names.extend(extract_text(child))

    return entity_names


def save_results(config, data):
    connection, cursor = connect(**config)
    cursor.execute("""CREATE TABLE IF NOT EXISTS named_entities(
                        id INT(6) UNSIGNED AUTO_INCREMENT PRIMARY KEY,
                        first_name VARCHAR(50) NOT NULL,
                        last_name VARCHAR(70) NOT NULL,
                        job_title VARCHAR(255) NOT NULL,
                        email VARCHAR(100) NOT NULL,
                        url VARCHAR(255) NOT NULL)""")
    # each row dict is assumed to keep insertion order matching the INSERT columns
    prepared_data = [tuple(item.values()) for item in data]
    cursor.executemany((
        "INSERT INTO named_entities (first_name, last_name, job_title, email, url) "
        "VALUES (%s, %s, %s, %s, %s)"), prepared_data)
    connection.commit()
    row_count = cursor.rowcount
    cursor.close()
    return row_count


if __name__ == "__main__":
    titles = keywords().get('academic_title', [])
    config = get_db_config()
    data = get_data(config['remote'])
    entities = use_stanford_ner(data)
    save_results(config['local'], entities)
Example #14
import pandas as pd

# read the data
data = pd.read_csv("../data/data.csv")

# split the data by publisher
data1 = data.iloc[0:127, :]    # publisher: ANNALS OF STATISTICS
data2 = data.iloc[127:174, :]  # publisher: ROYAL
data3 = data.iloc[174:318, :]  # publisher: AMERICAN
data4 = data.iloc[318:393, :]  # publisher: BIOMETRIKA
data_list = [data1, data2, data3, data4]

# keyword statistics per publisher
from keywords import keywords
for i in data_list:
    keywords(i)
    print("\n")

# author information statistics per publisher
from author_info import authorInfo
for i in data_list:
    authorInfo(i)
    print("\n")

# author network relations per publisher
from author_relation import draw_relation
for i in data_list:
    draw_relation(i)
    print("\n")
Example #15
    def run(self):
        self.variables.sc = SlackClient(token)
        logger('crit', BOT_NAME + " Connected!")
        if self.variables.sc.rtm_connect():
            bot_mention = "<@{}".format(
                self.variables.sc.server.login_data["self"]["id"])
            while self.should_run:
                for evt in self.variables.sc.rtm_read():
                    # Original Command Parsing
                    if "type" in evt and evt[
                            "type"] == "message" and "text" in evt:

                        # This allows replies in the threads. Current issue is picture shows up
                        # as bot shadow in threaded conversation summary
                        if "thread_ts" not in evt:
                            evt.update({'thread_ts': ''})

                        # This seems like the right place to snoop
                        count = keywords(self.variables.sc, evt,
                                         self.variables.yamldata,
                                         self.variables.keyword_count)
                        self.variables.count = count

                        message = evt["text"].strip()

                        # This logic was used for misdirection. It allows you to send a message
                        # starting with a channel name to have botiana deliver it there. Clever.
                        channel = ''
                        command = ''
                        if "channel" in evt and evt[
                                "type"] == "message" and evt[
                                    "channel"].startswith("D"):
                            try:
                                _, channel, command, message = message.split(
                                    None, 3)
                            except ValueError:
                                try:
                                    _, channel, command = message.split(
                                        None, 2)
                                except ValueError:
                                    pass
                            if channel.startswith("<#C"):
                                logger('warn', 'misdirection module invoked')
                                misdirected_channel = str(
                                    re.findall(r'\w+', channel)[0])
                                evt["channel"] = misdirected_channel
                                message = bot_mention + "> " + command + " " + message

                        if message.startswith(bot_mention):
                            try:
                                # have a botname, command, and message?
                                _, command, message = message.split(None, 2)
                                message_router(self.variables, bot_mention,
                                               evt, command, message)
                            except ValueError:
                                try:
                                    # maybe just a botname and command?
                                    _, command = message.split(None, 1)
                                    message_router(self.variables, bot_mention,
                                                   evt, command, '')
                                except ValueError:
                                    # this should never happen....
                                    logger(
                                        'info',
                                        "value error in command parsing - "
                                        "this should never happen")
                        elif bot_mention in message:
                            if enable_message_processing is True:
                                try:
                                    logger(
                                        'info',
                                        "routing message to message_processing"
                                    )
                                    message_router(self.variables, bot_mention,
                                                   evt,
                                                   message_processing_module,
                                                   message)
                                except ValueError:
                                    logger(
                                        'info',
                                        "failed to send message to message_processing module"
                                    )

                self.variables.current_time = time.time()
                time.sleep(.1)
        else:
            if self.variables.sc.server.login_data is None:
                logger('crit',
                       "Connection failed. Probably a bad/missing token.")
            else:
                logger(
                    'crit', "Connection failed. Server response: {}".format(
                        self.variables.sc.server.login_data["ok"]))
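The 'misdirection' branch above expects a direct message whose second token is a Slack channel reference (these look like <#C12345|general> in Slack's message format) followed by a command and the message body; the nested split(None, ...) calls peel those pieces apart. A small standalone illustration of that parsing, with a made-up message:

# Standalone illustration of the DM parsing used in the misdirection branch.
# The sample text is made up; real channel references look like <#C12345|general>.
message = "@botiana <#C12345|general> say hello everyone"
try:
    _, channel, command, rest = message.split(None, 3)
except ValueError:
    _, channel, command = message.split(None, 2)
    rest = ''
print(channel)  # <#C12345|general>
print(command)  # say
print(rest)     # hello everyone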