Example #1
    def test_re_escape(self):
        p = ""
        # This had to change from the original test of range(0,256)
        # because we can't support non-ascii non-utf8 strings
        for i in range(0, 128):
            p = p + chr(i)
            self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
                             True)
            self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))

        pat = re.compile(re.escape(p))
        self.assertEqual(pat.match(p) is not None, True)
        self.assertEqual(pat.match(p).span(), (0,128))
Example #2
    def test_re_escape(self):
        p=""
        # This had to change from the original test of range(0,256)
        # because we can't support non-ascii non-utf8 strings
        for i in range(0, 128):
            p = p + chr(i)
            self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
                             True)
            self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))

        pat=re.compile(re.escape(p))
        self.assertEqual(pat.match(p) is not None, True)
        self.assertEqual(pat.match(p).span(), (0,128))
Example #3
    def get_pattern(self, subject, modifiers):
        # cast to lists, so we're not splitting a single string
        if not isinstance(getattr(self, subject), list):
            setattr(self, subject, [getattr(self, subject)])
        if not isinstance(modifiers, list):
            modifiers = list(modifiers.split(' '))

        # cast all elements to strings in case of any numbers
        values = [unicode(val) for val in getattr(self, subject)]

        if 'regex' not in modifiers:
            values = [re.escape(val) for val in values]
        value_str = u'({0})'.format('|'.join(values))

        # check if they defined a match modifier
        for mod in self._match_modifiers:
            if mod in modifiers:
                match_mod = mod
                break
        else:
            subject = self.trimmed_key(subject)
            # handle subdomains for domain checks
            if subject == 'domain':
                value_str = ur'(?:.*?\.)?' + value_str

            match_mod = self._modifier_defaults.get(subject, 'includes-word')

        return self._match_modifiers[match_mod].format(value_str)
Example #5
    def test_basic_re_sub(self):
        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')

        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')

        self.assertEqual(
            re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
            '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
        self.assertEqual(
            re.sub('a', '\t\n\v\r\f\a', 'a'),
            (chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7)))

        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Example #6
    def test_basic_re_sub(self):
        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')

        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')

        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Example #7
    def test_basic_re_sub(self):
        self.assertEqual(re.sub(b"(?i)b+", b"x", b"bbbb BBBB"), b'x x')
        self.assertEqual(re.sub(b'\\d+', self.bump_num, b'08.2 -2 23x99y'),
                         b'9.3 -3 24x100y')
        self.assertEqual(re.sub(b'\\d+', self.bump_num, b'08.2 -2 23x99y', 3),
                         b'9.3 -3 23x99y')

        self.assertEqual(re.sub(b'.', lambda m: b"\\n", b'x'), b'\\n')
        self.assertEqual(re.sub(b'.', b"\\n", b'x'), b'\n')

        s = b"\\1\\1"
        self.assertEqual(re.sub(b'(.)', s, b'x'), b'xx')
        self.assertEqual(re.sub(b'(.)', re.escape(s), b'x'), s)
        self.assertEqual(re.sub(b'(.)', lambda m: s, b'x'), s)

        self.assertEqual(re.sub(b'(?P<a>x)', b'\g<a>\g<a>', b'xx'), b'xxxx')
        self.assertEqual(re.sub(b'(?P<a>x)', b'\g<a>\g<1>', b'xx'), b'xxxx')
        self.assertEqual(re.sub(b'(?P<unk>x)', b'\g<unk>\g<unk>', b'xx'),
                         b'xxxx')
        self.assertEqual(re.sub(b'(?P<unk>x)', b'\g<1>\g<1>', b'xx'), b'xxxx')

        self.assertEqual(
            re.sub(b'a',
                   b'\\t\\n\\v\\r\\f\\a\\b\\B\\Z\\a\\A\\w\\W\\s\\S\\d\\D',
                   b'a'), b'\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
        self.assertEqual(re.sub(b'a', b'\t\n\v\r\f\a', b'a'), b'\t\n\v\r\f\a')
        self.assertEqual(re.sub(b'a', b'\t\n\v\r\f\a', b'a'),
                         (chr(9) + chr(10) + chr(11) + chr(13) + chr(12) +
                          chr(7)).encode('utf-8'))

        self.assertEqual(re.sub(b'^\s*', b'X', b'test'), b'Xtest')
Example #8
def keyword_stem_regex(tokenizer, keyword, name=None):
    keyword_re = []
    word_tokens = word_segs(tokenizer, keyword.lower())
    for word in word_tokens:
        allow_ending = (
            all((tok.isalpha() for tok in word)) and len(word) >= 2
        )
        letters_stemmed = len("".join((w.decode("utf-8") for w in word[:-1])))
        do_stem = allow_ending and len(word) >= 3 and letters_stemmed >= 4
        if do_stem:
            del word[-1]
        joined = b"".join(word)
        keyword_re.append(
            re2.escape(joined) +
            (br"\pL*" if allow_ending else b"")
        )
    if not keyword_re:
        return None
    if name is not None:
        capture_group = b"?P<%s>" % name.encode("utf-8")
    else:
        capture_group = b""
    return (
        br"(" + capture_group +
        br"\s+".join(keyword_re) +
        br")"
    )
Example #9
    def fit(self):
        title_regs = []
        for ne in tqdm(self.named_entities, desc="fit ner"):
            for title in ne.get_titles(lang="en", with_disambiguation=False):
                title_regs += [
                    re.escape(expansion)
                    for expansion in TextNormalizer.get_rabbi_expansions(title)
                ]
        title_regs.sort(key=lambda x: len(x), reverse=True)
        word_breakers = r"|".join(
            re.escape(breaker) for breaker in [
                '.', ',', '"', '?', '!', '(', ')', '[', ']', '{', '}', ':',
                ';', '§', '<', '>', "'s"
            ])

        self.named_entity_regex = re.compile(
            fr"(?:^|\s|{word_breakers})({'|'.join(title_regs)})(?:\s|{word_breakers}|$)"
        )
Example #10
    def get_rabbi_regex(cls, rabbi):
        reg = rabbi.replace(
            cls.b_token,
            f"(?:{u'|'.join(re.escape(b) for b in cls.b_replacements)})")
        for starter in cls.starting_replacements:
            starter = re.escape(starter)
            reg = re.sub(f'^{starter}', f"(?:{starter.lower()}|{starter})",
                         reg)
        return reg
Example #11
    def run(self):
        ip_indicators = [
                "204.93.183.196",
                "50.31.146.109",
                "5.135.208.53",
                "103.25.59.120",
                "50.97.99.2",
                "173.203.112.215",
                "27.124.127.10",
                "78.129.181.191",
                "204.197.254.94",
                "50.31.146.134",
        ]

        match_file = self.check_file(pattern=".*\\\\Application\\ Data\\\\Microsoft\\\\[a-z]{3}(api32|audio|bios|boot|cap32|common|config|crypt|edit32|error|mgr32|serial|setup|share|sock|system|update|video|windows)\.exe$", regex=True, all=True)
        match_batch_file = self.check_file(pattern=".*\\\\Application\\ Data\\\\\d{1,10}\.bat$", regex=True, all=True)
        match_runkey = self.check_key(pattern=".*\\\\Microsoft\\\\Windows\\\\CurrentVersion\\\\Run\\\\[a-z]{3}(api32|audio|bios|boot|cap32|common|config|crypt|edit32|error|mgr32|serial|setup|share|sock|system|update|video|windows)\.exe$", regex=True, all=True)
        match_otherkey = self.check_key(pattern=".*\\\\Microsoft\\\\Office\\\\Common\\\\(?P<hex>[A-F0-9]+)\\\\(?P=hex)(CS|PS|SS|RS)", regex=True, all=True)
        match_mutex = self.check_mutex(pattern="^[A-F0-9]{1,8}(I|M|RM)$", regex=True, all=True)
        found_match_ip = False
        found_match_url = False
        if match_file:
            for match in match_file:
                self.data.append({"file": match})
        if match_batch_file:
            for match in match_batch_file:
                self.data.append({"batchfile": match})
        if match_runkey:
            for match in match_runkey:
                self.data.append({"runkey": match})
        if match_otherkey:
            for match in match_otherkey:
                self.data.append({"otherkey": match})
        if match_mutex:
            for match in match_mutex:
                self.data.append({"mutex": match})
        for ip_indicator in ip_indicators:
            match_ip = self.check_ip(pattern=ip_indicator)
            if match_ip:
                self.data.append({"ip": match_ip})
                found_match_ip = True
            match_url = self.check_url(pattern="http://" + re.escape(ip_indicator) + ":8080/[a-f0-9]{1,8}/[a-f0-9]{1,8}/", regex=True,all=True)
            if match_url:
                for match in match_url:
                    self.data.append({"url": match})
                found_match_url = True

        if match_file or match_batch_file or match_mutex or found_match_ip or found_match_url or match_runkey or match_otherkey:
            return True

        return False
Example #12
    def handleEvent(self, event):
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.sf.debug("Received event, " + eventName + ", from " +
                      srcModuleName)

        # Don't look up stuff twice
        if eventData in self.results:
            self.sf.debug("Skipping " + eventData + " as already mapped.")
            return None
        else:
            self.results[eventData] = True

        data = self.query(eventData)
        if data is None:
            return None

        for n in data:
            e = SpiderFootEvent("LEAKSITE_URL", n, self.__name__, event)
            self.notifyListeners(e)

            res = self.sf.fetchUrl(n,
                                   timeout=self.opts['_fetchtimeout'],
                                   useragent=self.opts['_useragent'])

            if res['content'] is None:
                self.sf.debug("Ignoring " + n + " as no data returned")
                continue

            # Sometimes pastes search results false positives
            if re.search(
                    "[^a-zA-Z\-\_0-9]" + re.escape(eventData) +
                    "[^a-zA-Z\-\_0-9]", res['content'], re.IGNORECASE) is None:
                continue

            try:
                startIndex = res['content'].index(eventData)
            except BaseException:
                self.sf.debug("String not found in pastes content.")
                continue

            evt = SpiderFootEvent("LEAKSITE_CONTENT", res['content'],
                                  self.__name__, e)
            self.notifyListeners(evt)
Example #13
    def _add(self, message: IRCMessage):
        """add - Adds a quote to the OutOfContext log. The quote will be pulled from a message line buffer."""
        if len(message.parameterList) < 2:
            return IRCResponse(ResponseType.Say, "Add what?", message.replyTo)
        if message.targetType == TargetTypes.USER:
            return IRCResponse(ResponseType.Say,
                               "You can only add messages from channels.",
                               message.replyTo)

        regex = re2.compile(re2.escape(" ".join(message.parameterList[1:])),
                            re2.IGNORECASE)
        if not self.messageStore or message.channel not in self.messageStore:
            return IRCResponse(ResponseType.Say,
                               "Sorry, there are no messages in my buffer.",
                               message.replyTo)

        matches = list(filter(regex.search,
                              self.messageStore[message.channel]))
        if len(matches) == 0:
            return IRCResponse(
                ResponseType.Say,
                "Sorry, that didn't match anything in my message buffer.",
                message.replyTo)
        if len(matches) > 1:
            return IRCResponse(
                ResponseType.Say,
                "Sorry, that matches too many lines in my message buffer.",
                message.replyTo)

        todayDate = time.strftime("[%Y-%m-%d] [%H:%M]")
        quote = f"{todayDate} {matches[0]}"
        if message.replyTo not in self.storage:
            self.storage[message.replyTo] = []
        if len(self.storage[message.replyTo]) > 0 and self.storage[
                message.replyTo][-1] == quote:
            return IRCResponse(
                ResponseType.Say,
                "That quote has already been added to the log!",
                message.replyTo)
        else:
            self.storage[message.replyTo].append(quote)
            return IRCResponse(ResponseType.Say,
                               f"Quote '{quote}' was added to the log!",
                               message.replyTo)
Example #14
def run():
    '''
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    start = time()
    # Browser
    br = mechanize.Browser()
    # Cookie Jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    #br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)
    # User-Agent
    br.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
    print "initiated browser: " + str(time()-start) + " seconds"
    
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    # volume/page of JAMA review articles from 2000/01 to 2013/04/1
    vol_pg_tuples = [('309', '1278'), ('309', '1163'), ('309', '926'), ('309', '919'), ('309', '814'), ('309', '706'), ('309', '678'), ('309', '594'), ('308', '2507'), ('309', '71'), ('308', '2612'), ('308', '1024'), ('308', '502'), ('307', '2526'), ('307', '2079'), ('307', '2418'), ('307', '1959'), ('307', '1185'), ('307', '1072'), ('307', '713'), ('307', '294'), ('307', '182'), ('306', '2704'), ('306', '2011'), ('306', '1782'), ('306', '1688'), ('306', '1359'), ('306', '1241'), ('306', '978'), ('306', '746'), ('306', '627'), ('306', '420'), ('305', '2335'), ('305', '1790'), ('305', '1327'), ('305', '1225'), ('305', '1119'), ('305', '1008'), ('305', '698'), ('305', '487'), ('305', '284'), ('305', '78'), ('304', '2628'), ('304', '2161'), ('304', '2048'), ('304', '1592'), ('304', '890'), ('304', '779'), ('304', '452'), ('304', '321'), ('304', '76'), ('303', '2280'), ('303', '1848'), ('303', '1738'), ('303', '1729'), ('303', '1526'), ('303', '1295'), ('303', '1180'), ('303', '1077'), ('303', '865'), ('303', '438'), ('303', '47'), ('302', '2679'), ('302', '2345'), ('302', '2243'), ('302', '2135'), ('302', '1316'), ('302', '985'), ('302', '550'), ('302', '537'), ('302', '412'), ('302', '179'), ('301', '2472'), ('301', '2362'), ('301', '2349'), ('302', '73'), ('301', '2129'), ('301', '1358'), ('301', '636'), ('301', '954'), ('301', '415'), ('301', '309'), ('300', '2886'), ('300', '2779'), ('300', '2754'), ('300', '2647'), ('300', '2638'), ('300', '2514'), ('301', '82'), ('300', '2407'), ('300', '2286'), ('300', '2277'), ('300', '2161'), ('300', '1793'), ('300', '1674'), ('300', '2036'), ('300', '1439'), ('300', '1181'), ('300', '711'), ('300', '555'), ('300', '197'), ('299', '2777'), ('299', '2423'), ('299', '1937'), ('299', '1698'), ('299', '1446'), ('299', '1320'), ('299', '1166'), ('299', '937'), ('299', '925'), ('299', '914'), ('299', '806'), ('299', '793'), ('299', '672'), ('299', '324'), ('299', '555'), ('298', '2895'), ('298', '2654'), ('298', '2296'), ('298', '2171'), ('298', '1911'), ('298', '1900'), ('298', '1429'), ('298', '1312'), ('298', '1300'), ('298', '1038'), ('298', '1023'), ('298', '902'), ('298', '786'), ('298', '655'), ('298', '438'), ('298', '194'), ('298', '70'), ('298', '61'), ('297', '2741'), ('297', '2617'), ('297', '2603'), ('297', '2502'), ('297', '2391'), ('297', '2381'), ('297', '2264'), ('297', '2251'), ('297', '2241'), ('297', '2018'), ('297', '1810'), ('297', '1697'), ('297', '1583'), ('297', '1551'), ('297', '1478'), ('297', '1241'), ('297', '1233'), ('297', '986'), ('297', '842'), ('297', '831'), ('297', '733'), ('297', '724'), ('297', '77'), ('296', '2839'), ('296', '2558'), ('296', '2234'), ('296', '2012'), ('296', '1885'), ('296', '1764'), ('296', '1731'), ('296', '1507'), ('296', '1377'), ('296', '1274'), ('296', '1619'), ('296', '1633'), ('296', '1116'), ('296', '1103'), ('296', '1094'), ('296', '974'), ('296', '815'), ('296', '679'), ('296', '445'), ('296', '427'), ('295', '2765'), ('295', '2286'), ('295', '2275'), ('295', '2057'), ('295', '1824'), ('295', '1688'), ('295', '1566'), ('295', '1288'), ('295', '1050'), ('295', '809'), ('295', '547'), ('295', '536'), ('295', '416'), ('295', '403'), ('295', '199'), ('294', '3124'), ('294', '2889'), ('294', '2751'), ('294', '2623'), ('294', '2342'), ('294', '2203'), ('294', '2064'), ('294', '1944'), ('287', '2784'), ('284', '1417'), ('287', '1301'), ('289', '3161'), ('289', '1976'), ('291', '2865'), ('294', '947'), ('289', '217'), ('285', '2498'), ('288', '2793'), ('289', '331'), ('285', '1819'), ('291', 
'2013'), ('293', '3043'), ('293', '1509'), ('292', '972'), ('289', '1837'), ('289', '2992'), ('283', '2568'), ('286', '1610'), ('292', '726'), ('292', '1593'), ('287', '2701'), ('288', '2151'), ('284', '2919'), ('289', '3145'), ('287', '2335'), ('290', '1001'), ('294', '725'), ('289', '747'), ('293', '730'), ('283', '1451'), ('284', '1820'), ('285', '1415'), ('287', '2570'), ('285', '1613'), ('287', '2869'), ('284', '2785'), ('290', '1360'), ('285', '3065'), ('293', '2391'), ('291', '2367'), ('288', '1388'), ('293', '1906'), ('284', '215'), ('293', '1089'), ('287', '1233'), ('286', '208'), ('291', '870'), ('284', '934'), ('290', '248'), ('291', '358'), ('287', '1840'), ('293', '855'), ('292', '1989'), ('294', '97'), ('285', '193'), ('288', '1116'), ('292', '2890'), ('293', '90'), ('289', '1288'), ('291', '1610'), ('290', '2599'), ('287', '1502'), ('294', '1088'), ('289', '1681'), ('292', '1480'), ('288', '2579'), ('293', '2372'), ('288', '611'), ('291', '99'), ('286', '2516'), ('291', '986'), ('290', '86'), ('283', '381'), ('285', '2763'), ('287', '487'), ('287', '883'), ('283', '3110'), ('287', '1308'), ('293', '596'), ('292', '1602'), ('293', '1245'), ('293', '2012'), ('293', '1644'), ('286', '1360'), ('288', '1889'), ('291', '228'), ('286', '2787'), ('285', '1489'), ('287', '226'), ('294', '1534'), ('292', '852'), ('286', '1218'), ('288', '3137'), ('290', '2464'), ('288', '2233'), ('291', '2359'), ('289', '2475'), ('293', '979'), ('287', '1848'), ('290', '524'), ('293', '1653'), ('290', '932'), ('283', '1469'), ('292', '2755'), ('286', '2308'), ('287', '622'), ('291', '1999'), ('287', '2414'), ('287', '1022'), ('285', '1059'), ('293', '2141'), ('287', '425'), ('289', '2254'), ('291', '1887'), ('293', '987'), ('287', '2691'), ('286', '2143'), ('289', '2857'), ('293', '1223'), ('292', '367'), ('288', '932'), ('285', '1338'), ('285', '2891'), ('294', '238'), ('293', '1501'), ('292', '1724'), ('286', '895'), ('293', '477'), ('290', '1767'), ('292', '1867'), ('292', '2901'), ('290', '659'), ('291', '2746'), ('289', '589'), ('289', '347'), ('286', '341'), ('291', '605'), ('287', '1972'), ('283', '2008'), ('283', '3244'), ('289', '210'), ('288', '2868'), ('286', '2000'), ('293', '2641'), ('288', '2569'), ('291', '1127'), ('284', '412'), ('292', '2880'), ('286', '2296'), ('286', '3056'), ('288', '2167'), ('288', '872'), ('285', '1193'), ('285', '992'), ('289', '2413'), ('287', '1435'), ('285', '2055'), ('292', '97'), ('286', '1149'), ('292', '1074'), ('291', '1238'), ('291', '1368'), ('290', '2849'), ('290', '2057'), ('288', '2458'), ('285', '2232'), ('286', '442'), ('288', '629'), ('290', '2455'), ('288', '1901'), ('287', '2114'), ('288', '2724'), ('289', '80'), ('284', '1689'), ('289', '3300'), ('292', '2874'), ('291', '2243'), ('292', '89'), ('287', '92'), ('293', '1367'), ('289', '2545'), ('290', '1633'), ('287', '762'), ('288', '2449'), ('292', '2771'), ('290', '2301'), ('290', '1510'), ('285', '1186'), ('283', '3102'), ('285', '785'), ('291', '736'), ('292', '237'), ('292', '2622'), ('290', '1906'), ('289', '2041'), ('285', '1987'), ('289', '2120'), ('290', '2476'), ('284', '1549'), ('294', '1671'), ('286', '2270'), ('287', '2391'), ('283', '2281'), ('286', '2981'), ('293', '2257'), ('287', '360'), ('283', '1800'), ('286', '2441'), ('289', '2849'), ('287', '2120'), ('289', '895'), ('292', '490'), ('288', '1622'), ('293', '217'), ('287', '236'), ('291', '350'), ('291', '1487'), ('287', '2917'), ('286', '944'), ('286', '821'), ('288', '745'), ('288', '222'), ('287', '2236'), ('293', '349'), 
('292', '2388'), ('287', '628'), ('285', '386'), ('287', '2821'), ('284', '1828'), ('286', '954'), ('291', '1763'), ('292', '3017'), ('288', '351'), ('289', '454'), ('288', '1610'), ('287', '3116'), ('290', '719')]
    for count, vol_pg_tuple in enumerate(vol_pg_tuples):
        url = 'http://jama.jamanetwork.com/article.aspx?volume=%s&page=%s' % vol_pg_tuple
        try:
            sys.stdout.write("article # " + str(count) + " reading url...")
            start = time()
            r = br.open(url)
            entry_url = r.geturl()
            entry_html_source = r.read()
            soup = BeautifulSoup(entry_html_source.decode('utf-8'), 'html5lib')
            is_free = soup.find(class_='freeArticle')
            if is_free is None:
                sys.stdout.write(str(time()-start) + " seconds")
                sys.stdout.write("...skipping, article not free.\n")
                sys.stdout.flush()
            else:
                sys.stdout.write("adding to database...")
                # format of returned list from get_metadata function:
                # 0 identifier
                # 1 type
                # 2 language
                # 3 title
                # 4 date
                # 5 publisher
                # 6 author
                # 7 journal
                # 8 volume
                # 9 issue
                # 10 firstpage
                # 11 lastpage
                # 12 url
                res_metadata = parser.get_metadata(entry_url, entry_html_source)
                res_metadata[1] = 'JAMA review articles'
                res_identifier = res_metadata[0]
                # creates new Resource object and containing Subresource objects
                # creates Resource based on returned parser metadata
                res = Resource(identifier = res_metadata[0],
                    type = res_metadata[1],
                    language = res_metadata[2],
                    title = res_metadata[3],
                    date = res_metadata[4],
                    publisher = res_metadata[5],
                    author = res_metadata[6],
                    journal = res_metadata[7],
                    volume = res_metadata[8],
                    issue = res_metadata[9],
                    firstpage = res_metadata[10],
                    lastpage = res_metadata[11],
                    url = entry_url,
                    html_source = entry_html_source)
                res.save()
                res.user.add(9) # corresponds to [email protected]
                #res.user.add(2) # corresponds to [email protected]
                res.domain.add(1) # corresponds to Biomedical
                subres = []
                # creates Subresource objects of type 'figure'
                figures = parser.get_figures(entry_url, entry_html_source)
                for i, figure in enumerate(figures):
                    subres.append(Subresource(containing_resource = res,
                        name = figure[0].split('. ')[0],
                        type = 'figure',
                        content = u'. '.join(figure[0].split('. ')[1:]) + u'. ' + figure[1],
                        url = figure[4]))
                # creates Subresource objects of type 'paragraph'
                paragraphs = parser.get_paragraphs(entry_url, entry_html_source)
                for i, paragraph in enumerate(paragraphs):
                    subres.append(Subresource(containing_resource = res,
                        name = 'paragraph ' + str(i),
                        type = 'paragraph',
                        content = paragraph))
                subres_temp = Subresource.objects.bulk_create(subres)
                del subres_temp
                del subres
                sys.stdout.write(str(time()-start) + " seconds\n")
                sys.stdout.flush()
        except Exception, e:
            print "failed. exception: "+str(e)
            traceback.print_exc()
    '''
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    sys.stdout.write("~~~~loading concepts and term lists... ")
    start = time()
    file = open('scripts\MESH_concept_and_terms_tuple.pkl', 'rb')
    (tot_concepts, concept_IDs, term_lists) = pickle_zloads(file.read())
    file.close()
    sys.stdout.write("%.2f" % (time() - start) + "seconds\n")
    sys.stdout.flush()

    res_ids = list(
        Resource.objects.filter(type="JAMA review articles").values_list(
            'id', flat=True))
    print "total # of resources: " + str(len(res_ids))
    for count, res_id in enumerate(res_ids):
        try:
            sys.stdout.write("article # " + str(count) + " processing...")
            start = time()
            target_paragraphs = Subresource.objects.filter(
                containing_resource_id=res_id)

            #create sentences from target_paragraphs
            sentences = []
            sentences_indexofparagraph = []
            tot_para = 0
            tot_sent = 0
            for para_num, target_paragraph in enumerate(target_paragraphs):
                #find all sentence in this paragraph
                tokenized_sentences = sent_tokenize(
                    target_paragraph.content.rstrip())
                sentences.extend(tokenized_sentences)
                sentences_indexofparagraph.extend([para_num] *
                                                  len(tokenized_sentences))
                tot_sent = tot_sent + len(tokenized_sentences)
                tot_para = tot_para + 1
            tot_para = len(target_paragraphs)

            #second go through each concept/term, find them in subresources, and process into matrix
            tc = 0
            j = 0
            row_sentence = []
            row_paragraph = []
            col_sentence = []
            col_paragraph = []
            data_sentence = []
            data_paragraph = []
            # initialize list of empty lists for storing concepts contained in each paragraph
            para_conceptIDs_contained = [[] for i in range(tot_para)]
            for i, con_ID in enumerate(concept_IDs):
                term_list = term_lists[i]
                wordcount_in_paragraphs = [0] * tot_para
                terms_regex = [
                    r"\b" + re2.escape(term.lower()) + r"\b"
                    for term in term_list
                ]
                search_pattern = re2.compile("|".join(terms_regex))
                for sent_num, sentence in enumerate(sentences):
                    wordcount = len(search_pattern.findall(sentence.lower()))
                    if wordcount > 0:  #only go ahead if search_pattern is in the sentence
                        row_sentence.append(sent_num)
                        col_sentence.append(tc)
                        data_sentence.append(1)
                        wordcount_in_paragraphs[
                            sentences_indexofparagraph[sent_num]] += wordcount
                for para_num in range(tot_para):
                    wordcount_in_p = wordcount_in_paragraphs[para_num]
                    if wordcount_in_p > 0:
                        row_paragraph.append(para_num)
                        col_paragraph.append(tc)
                        data_paragraph.append(1)
                        para_conceptIDs_contained[para_num].append(con_ID)
                if tc * 10 / tot_concepts > j:
                    percent_done = tc * 10 / tot_concepts * 10
                    sys.stdout.write(str(percent_done) + "% ")
                    j = j + 1
                tc = tc + 1

            # update concepts_contained fields for all subresource objects
            for para_num in range(tot_para):
                if len(para_conceptIDs_contained[para_num]) > 0:
                    target_paragraphs[para_num].concepts_contained.add(
                        *para_conceptIDs_contained[para_num])

            #create target_A matrix
            target_A_sentence = coo_matrix(
                (array(data_sentence),
                 (array(row_sentence), array(col_sentence))),
                shape=(tot_sent, tot_concepts),
                dtype=int16)
            #target_A_paragraph = coo_matrix((array(data_paragraph),(array(row_paragraph),array(col_paragraph))),shape=(tot_para,tot_concepts),dtype=int16)

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # now convert target_A into a scipy csr_matrix (sparse matrix)
            target_A_sentence = target_A_sentence.tocsr()
            #target_A_paragraph = target_A_paragraph.tocsr()

            # calculate AtA for target_A
            AtA_sentence = target_A_sentence.T * target_A_sentence
            #AtA_paragraph = target_A_paragraph.T * target_A_paragraph

            # add AtA to Big_A
            if count == 0:
                bigA_AtA_sentence = AtA_sentence
                N_sentence = tot_sent
                #bigA_AtA_paragraph = AtA_paragraph
                #N_paragraph = tot_para
            else:
                bigA_AtA_sentence = bigA_AtA_sentence + AtA_sentence
                N_sentence = N_sentence + tot_sent
                #bigA_AtA_paragraph = bigA_AtA_paragraph + AtA_paragraph
                #N_paragraph = N_paragraph + tot_para

            sys.stdout.write(str(time() - start) + " seconds\n")
            sys.stdout.flush()
        except Exception, e:
            print "failed. exception: " + str(e)
            traceback.print_exc()
Example #15
    def handleEvent(self, event):
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        if self.errorState:
            return None

        if self.opts['api_key'] == "":
            self.sf.error(
                "You enabled sfp_pastebin but did not set a Google API key!",
                False)
            self.errorState = True
            return None

        if eventData in self.results:
            return None
        else:
            self.results[eventData] = True

        for dom in self.domains.keys():
            links = list()
            target = self.domains[dom]
            res = self.sf.googleIterate(
                searchString="+site:{target_site} \"{search_keyword}\"".format(
                    target_site=target,
                    search_keyword=eventData,
                ),
                opts={
                    "timeout": self.opts["_fetchtimeout"],
                    "useragent": self.opts["_useragent"],
                    "api_key": self.opts["api_key"],
                    "cse_id": self.opts["cse_id"],
                },
            )

            if res is None:
                # Failed to talk to the Google API or no results returned
                return None

            urls = res["urls"]
            new_links = list(set(urls) - set(self.results.keys()))

            # Add new links to results
            for l in new_links:
                self.results[l] = True

            relevant_links = [
                link for link in new_links
                if self.sf.urlBaseUrl(link).endswith(target)
            ]

            for link in relevant_links:
                self.sf.debug("Found a link: " + link)

                if self.checkForStop():
                    return None

                res = self.sf.fetchUrl(link,
                                       timeout=self.opts['_fetchtimeout'],
                                       useragent=self.opts['_useragent'])

                if res['content'] is None:
                    self.sf.debug("Ignoring " + link + " as no data returned")
                    continue

                # Sometimes pastes search results false positives
                if re.search(
                        "[^a-zA-Z\-\_0-9]" + re.escape(eventData) +
                        "[^a-zA-Z\-\_0-9]", res['content'],
                        re.IGNORECASE) is None:
                    continue

                try:
                    startIndex = res['content'].index(eventData)
                except BaseException as e:
                    self.sf.debug("String not found in pastes content.")
                    continue

                evt1 = SpiderFootEvent("LEAKSITE_URL", link, self.__name__,
                                       event)
                self.notifyListeners(evt1)

                evt2 = SpiderFootEvent("LEAKSITE_CONTENT", res['content'],
                                       self.__name__, evt1)
                self.notifyListeners(evt2)
Example #16
    def run(self):
        ip_indicators = [
            "204.93.183.196",
            "50.31.146.109",
            "5.135.208.53",
            "103.25.59.120",
            "50.97.99.2",
            "173.203.112.215",
            "27.124.127.10",
            "78.129.181.191",
            "204.197.254.94",
            "50.31.146.134",
        ]

        match_file = self.check_file(
            pattern=
            ".*\\\\Application\\ Data\\\\Microsoft\\\\[a-z]{3}(api32|audio|bios|boot|cap32|common|config|crypt|edit32|error|mgr32|serial|setup|share|sock|system|update|video|windows)\.exe$",
            regex=True,
            all=True)
        match_batch_file = self.check_file(
            pattern=".*\\\\Application\\ Data\\\\\d{1,10}\.bat$",
            regex=True,
            all=True)
        match_runkey = self.check_key(
            pattern=
            ".*\\\\Microsoft\\\\Windows\\\\CurrentVersion\\\\Run\\\\[a-z]{3}(api32|audio|bios|boot|cap32|common|config|crypt|edit32|error|mgr32|serial|setup|share|sock|system|update|video|windows)\.exe$",
            regex=True,
            all=True)
        match_otherkey = self.check_key(
            pattern=
            ".*\\\\Microsoft\\\\Office\\\\Common\\\\(?P<hex>[A-F0-9]+)\\\\(?P=hex)(CS|PS|SS|RS)",
            regex=True,
            all=True)
        match_mutex = self.check_mutex(pattern="^[A-F0-9]{1,8}(I|M|RM)$",
                                       regex=True,
                                       all=True)
        found_match_ip = False
        found_match_url = False
        if match_file:
            for match in match_file:
                self.data.append({"file": match})
        if match_batch_file:
            for match in match_batch_file:
                self.data.append({"batchfile": match})
        if match_runkey:
            for match in match_runkey:
                self.data.append({"runkey": match})
        if match_otherkey:
            for match in match_otherkey:
                self.data.append({"otherkey": match})
        if match_mutex:
            for match in match_mutex:
                self.data.append({"mutex": match})
        for ip_indicator in ip_indicators:
            match_ip = self.check_ip(pattern=ip_indicator)
            if match_ip:
                self.data.append({"ip": match_ip})
                found_match_ip = True
            match_url = self.check_url(pattern="http://" +
                                       re.escape(ip_indicator) +
                                       ":8080/[a-f0-9]{1,8}/[a-f0-9]{1,8}/",
                                       regex=True,
                                       all=True)
            if match_url:
                for match in match_url:
                    self.data.append({"url": match})
                found_match_url = True

        if match_file or match_batch_file or match_mutex or found_match_ip or found_match_url or match_runkey or match_otherkey:
            return True

        return False
Example #17
    def handleEvent(self, event):
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        if self.errorState:
            return None

        self.sf.debug("Received event, " + eventName + ", from " +
                      srcModuleName)

        if self.opts['api_key'] == "":
            self.sf.error(
                "You enabled sfp_onioncity but did not set a Google API key!",
                False)
            self.errorState = True
            return None

        if eventData in self.results:
            self.sf.debug("Already did a search for " + eventData +
                          ", skipping.")
            return None
        else:
            self.results[eventData] = True

        # Sites hosted on the domain
        res = self.sf.googleIterate(
            searchString="+site:onion.link " + eventData,
            opts={
                "timeout": self.opts["_fetchtimeout"],
                "useragent": self.opts["_useragent"],
                "api_key": self.opts["api_key"],
                "cse_id": self.opts["cse_id"],
            },
        )
        if res is None:
            # Failed to talk to the bing API or no results returned
            return None

        urls = res["urls"]
        new_links = list(set(urls) - set(self.results.keys()))

        # Add new links to results
        for l in new_links:
            self.results[l] = True

        # Submit the Google results for analysis
        googlesearch_url = res["webSearchUrl"]
        response = self.sf.fetchUrl(
            googlesearch_url,
            timeout=self.opts["_fetchtimeout"],
            useragent=self.opts["_useragent"],
        )
        if response['code'].startswith('2'):
            evt = SpiderFootEvent("RAW_RIR_DATA", response["content"],
                                  self.__name__, event)
            self.notifyListeners(evt)
        else:
            self.sf.error("Failed to fetch Google web search URL",
                          exception=False)

        # Check if we've been asked to stop
        if self.checkForStop():
            return None

        darknet_links = [
            link for link in new_links
            if self.sf.urlFQDN(link).endswith(".onion.link")
        ]

        for link in darknet_links:
            self.sf.debug("Found a darknet mention: " + link)
            torlink = link.replace(".onion.link", ".onion")
            if self.opts['fetchlinks']:
                res = self.sf.fetchUrl(torlink,
                                       timeout=self.opts['_fetchtimeout'],
                                       useragent=self.opts['_useragent'])

                if res['content'] is None:
                    self.sf.debug("Ignoring " + link + " as no data returned")
                    continue

                # Sometimes onion city search results false positives
                if re.search(
                        "[^a-zA-Z\-\_0-9]" + re.escape(eventData) +
                        "[^a-zA-Z\-\_0-9]", res['content'],
                        re.IGNORECASE) is None:
                    self.sf.debug("Ignoring " + link + " as no mention of " +
                                  eventData)
                    continue

                evt = SpiderFootEvent("DARKNET_MENTION_URL", torlink,
                                      self.__name__, event)
                self.notifyListeners(evt)

                try:
                    startIndex = res['content'].index(eventData) - 120
                    endIndex = startIndex + len(eventData) + 240
                except BaseException as e:
                    self.sf.debug("String not found in content.")
                    continue

                data = res['content'][startIndex:endIndex]
                evt = SpiderFootEvent("DARKNET_MENTION_CONTENT",
                                      "..." + data + "...", self.__name__, evt)
                self.notifyListeners(evt)
            else:
                evt = SpiderFootEvent("DARKNET_MENTION_URL", torlink,
                                      self.__name__, event)
                self.notifyListeners(evt)
Example #18
	def do_command(self, e, cmd, nick, target, reply, dm):
		c = self.connection

		emoticontable = {
				':)': '☺',
# Some lines commented out due to lack of widespread font support
#				':D': '😃',
#				'^^': '😄',
#				'^_^':'😄',
#				':|': '😑',
				':(': '☹',
#				':/': '😕',
#				':\\':'😕',
#				'-.-':'😒',
#				':P' :'😛',
#				';P' :'😜',
#				'xP' :'😝',
#				';)' :'😉',
#				':?' :'😖',
#				'>:(':'😠',
#				'D:' :'😦',
#				':o' :'😯',
#				':O' :'😮',
#				'B)' :'😎'
				}
		for emoticon, uchar in emoticontable.items():
			if re.findall('(^|\W)'+re.escape(emoticon)+'(\W|$)', cmd) and random() < 0.333:
				reply('Did you mean {} (U+{:x}) with “{}”?'.format(uchar, ord(uchar), emoticon))
				break

		def replyopen():
			if self.lastopen:
				reply('Space was last marked {} by {} on {}.'.format(*self.lastopen))
			else:
				reply("I don't know when was the last time the space was open.")
		if cmd.startswith('open'):
			if '?' in cmd or '‽' in cmd:
				if cmd.count('?') >= 5:
					self.sendchan('afrabot: open?')
					return
				replyopen()
			else:
				if cmd.count('!') > 5:
					reply('u mad bro?')
					return
				self.set_open(True, nick)
			return
		if cmd.startswith('closed'):
			if '?' in cmd or '‽' in cmd:
				replyopen()
			else:
				if cmd.count('!') > 5:
					reply('u mad bro?')
					return
				dm('Please remember to follow the shutdown protocol.')
				self.set_open(False, nick)
			return
		if re.match('^ *genug +pleniert[.!]{,5}$', cmd) or re.match('^plenum[?!‽.]{,5}$', cmd):
			cs = self.chaossternchen
			if 'genug' in cmd:
				self.chaossternchen = []
				reply('Plenum beendet.')
			else:
				reply('Aye! So far, there are {} Chaos-☆'.format(len(cs)) + ('.' if len(cs) == 0 else ':'))
			for entry in enumerate(cs):
				reply('Chaos-☆ {}: {}'.format(*entry))
			return
		csmatch = re.match('^ *(delete|remove) +chaos-?([☆★☼☀*]|sternchen) *([0-9]+)[.!]{,5}$', cmd)
		if csmatch:
			try:
				num = int(csmatch.group(3))
				del self.chaossternchen[num]
				reply('Chaos-☆ {} deleted.'.format(num))
			except:
				reply('wut?')
			return
		if re.match('^help[?!‽.]*$', cmd):
			helptext = """open|closed? - query whether space is open
open|closed - set space open/closed
chaos*: [foobar] - add plenum topic
delete chaos* [num] - delete plenum topic number [n]
shutdown - list things to do when closing the space
plenum - list plenum topics
... and many more, doc urgently needed. Please submit PRs on github: https://github.com/afra/afrab0t
"""
			for line in helptext.splitlines():
				reply(line)
			return
		if re.match('^shutdown[?‽]*$', cmd):
			helptext = """* Fenster schließen (Beim rechten Fenster muss ein Hebel unten am Fenster betätigt werden. Bitte stellt sicher, dass beide Fenster dicht geschlossen sind.)
* Tische aufräumen und bei Bedarf kurz abwischen
* Geschirr spülen
* Kühlschrank auffüllen
* Heizung auf eine angemessene Stufe stellen (Winter: 2-3)
* Lampen, Computer, Boxen, Beamer, Kochplatte, Ofen, *Wasserkocher*, Laser abschalten
* Gucken, ob ralisi noch Geschirr abwäscht
* Müll mit runter nehmen
* Raum-, Aufgangs- und Haustür verschließen
"""
			for line in helptext.splitlines():
				reply(line)
			return
		if cmd == 'ponies?':
			reply('yes please!')
			return
		if re.match('^ *tell +afrab[o0]t +', cmd):
			reply('what is your problem?')
			return
		if cmd.rstrip('?') in ('where', 'location', 'wo'):
			reply('AfRA e.V. is located at Herzbergstr. 55, 10365 Berlin, 2.HH/Aufgang B, 3. floor on the '
					'left (Rm 3.08). Public transport: Tram M8, 21, 37 & Bus 256, N56, N50 → Herzbergstr./Siegfriedstr. '
					'Door closed? Try +49-176-29769254 !')
			return
		if cmd.rstrip('?') in ('tel', 'telefon', 'telephone', 'phone', 'handy', 'fon'):
			reply("Locked out? Wanna know what's up at AfRA? Try +49-176-29769254 !")
			return
		if cmd.rstrip('?!.') in ('cats', 'katzen', 'kittens', 'kätzchen'):
			try:
				submissions = self.reddit.get_subreddit('cats').get_hot(limit=50)
				index, item = next((i,s) for i,s in enumerate(submissions) if s.url not in self.catpiccache and not s.stickied and not s.is_self)
				self.catpiccache.append(item.url)
				if index != 5:
					reply('Got some cats for you: '+item.url)
				else:
					reply("Gee, you really like those cat things, don't you? You know, I could use some love, too: https://github.com/afra/afrab0t")
			except StopIteration:
				reply('The intertubes are empty.')
			return
		if cmd.rstrip('?!.') == 'catspam':
			def catspam():
				try:
					submissions = self.reddit.get_subreddit('cats').get_hot(limit=32)
					for s in submissions:
						if s.url not in self.nickcatpiccache[nick] and s.url not in self.catpiccache and not s.stickied and not s.is_self:
							self.nickcatpiccache[nick].append(s.url)
							dm(s.url)
							time.sleep(3)
				except Exception as e:
					log('Catspam problem:', e)
					reply('The intertubes are empty.')
			thr = Thread(target=catspam)
			thr.start()
			return
		if cmd.rstrip('?!.') in ('answer', 'antworte', 'antwort'):
			reply('42')
			return
		# ETA handling
		if cmd.rstrip('?') in ('etas', 'who', 'da'):
			with self.db as db:
				db.execute("DELETE FROM etas WHERE timestamp < DATETIME('now', '-1 day')")
			etas = ', '.join(nick+': '+eta for nick,eta in db.execute("SELECT nick, eta FROM etas").fetchall())
			if etas:
				reply('Current ETAs: '+etas)
			else:
				reply('No ETAs have been announced yet.')
			return
		# key handling
		keycmd = re.match('key ([\w]+) to ([\w]+)( *: *.*)?', cmd)
		if keycmd:
			with self.db as db:
				keystate, = db.execute("SELECT keystate FROM keylog ORDER BY timestamp DESC LIMIT 1").fetchone()
				keystatelist = keystate.split(', ')
				fromnick, tonick, comment = keycmd.groups()
				if not fromnick in keystatelist:
					reply('According to my information, as of now {} does not have a key. Current key '
							'holders are {}.'.format(fromnick, keystate))
					return
				keystatelist[keystatelist.index(fromnick)] = tonick
				keystate = ', '.join(keystatelist)
				db.execute("INSERT INTO keylog VALUES (DATETIME('now'),?,?,?,?)", (fromnick, tonick, keystate, comment))
				self.sendchan('Key transfer: {}→{}. Current key holders: {}'.format(fromnick, tonick, keystate))
			return
		if cmd.rstrip('?') == 'progress':
			t = datetime.datetime.now().time()
			p = 0
			if t.hour > 6 and t.hour < 18:
				p = ((t.hour-6)*3600+t.minute*60+t.second)/(3600*11)
			foo = round(67*p)
			bar = '='*foo
			space = ' '*(67-foo)
			reply('['+bar+'>'+space+'] ({:.2f}%)'.format(p*100))
			return
		if cmd.startswith('keystate '):
			keystate = re.split('[,;/: ]*', cmd)[1:]
			self.db.execute("INSERT INTO keylog VALUES (DATETIME('now'),'','',?,'')", (', '.join(keystate),))
			self.sendchan('Key status set. Current key holders: {}'.format(', '.join(keystate)))
			return
		keylog = re.match('keylog *([0-9]*)', cmd)
		if keylog:
			num = max(50, int(keylog.group(1) or 8))
			dm('The latest {} key log entries:'.format(num))
			loglines = self.db.execute("SELECT * FROM keylog ORDER BY timestamp DESC LIMIT ?", (num,))
			for timestamp, fromnick, tonick, keystate, comment in reversed(loglines):
				dm('{}: {}→{}; Key holders {}; Comment: "{}"'.format(
						timestamp, fromnick, tonick, keystate, comment))
			dm('EOL')
			return
		if cmd.startswith("f**k you"):
			reply('F*****g is entirely unnecessary: I can reproduce via copy-and-paste!')
			return
		if cmd.startswith("geh kacken"):
			reply('Command "kacken" not implemented. You are welcome to submit a pull request on github at https://github.com/afra/afrab0t')
			return
		# fall-through
		c.notice(nick, 'I don\'t know what you mean with "{}"'.format(cmd))
Example #19
    def test_bug_612074(self):
        pat = u"[" + re.escape(u"\u2039") + u"]"
        self.assertEqual(re.compile(pat) and 1, 1)
def run():

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    '''start = time()
    # Browser
    br = mechanize.Browser()
    # Cookie Jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    #br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)
    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    # Open some site, let's pick a random one, the first that pops in mind:
    br.add_password('https://www.nejm.org/sign-in', '*****', '*****')
    r = br.open('http://www.nejm.org/sign-in')
    br.select_form(nr=0)
    br.form['login']='******'
    br.form['password']='******'
    br.submit()
    print "initiated browser: " + str(time()-start) + " seconds"
    
    start = time()
    entry_urls = []
    for i in range(1, 38):
        html_url = 'http://www.nejm.org/medical-articles/review?page=' + str(i)
        r = br.open(html_url)
        html_source = r.read()
        soup = BeautifulSoup(html_source, 'html5lib')
        articleEntries = soup.find_all(class_='articleEntry')
        for entry in articleEntries:
            entry_urls.append('http://www.nejm.org' + entry.a['href'])
    print "obtained urls: " + str(time()-start) + " seconds"
    
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    for count, entry_url in enumerate(entry_urls):
        try:
            sys.stdout.write("article # " + str(count) + " adding to database...")
            start = time()
            r = br.open(entry_url)
            entry_html_source = r.read()
            # format of returned list from get_metadata function:
            # 0 identifier
            # 1 type
            # 2 language
            # 3 title
            # 4 date
            # 5 publisher
            # 6 author
            # 7 journal
            # 8 volume
            # 9 issue
            # 10 firstpage
            # 11 lastpage
            # 12 url
            res_metadata = parser.get_metadata(entry_url, entry_html_source)
            res_metadata[1] = 'NEJM review articles'
            res_identifier = res_metadata[0]
            # creates new Resource object and containing Subresource objects
            # creates Resource based on returned parser metadata
            res = Resource(identifier = res_metadata[0],
                type = res_metadata[1],
                language = res_metadata[2],
                title = res_metadata[3],
                date = res_metadata[4],
                publisher = res_metadata[5],
                author = res_metadata[6],
                journal = res_metadata[7],
                volume = res_metadata[8],
                issue = res_metadata[9],
                firstpage = res_metadata[10],
                lastpage = res_metadata[11],
                url = entry_url,
                html_source = entry_html_source)
            res.save()
            res.user.add(9) # corresponds to [email protected]
            #res.user.add(2) # corresponds to [email protected]
            res.domain.add(1) # corresponds to Biomedical
            subres = []
            # creates Subresource objects of type 'figure'
            figures = parser.get_figures(entry_url, entry_html_source)
            for i, figure in enumerate(figures):
                subres.append(Subresource(containing_resource = res,
                    name = figure[0],
                    type = 'figure',
                    content = figure[1],
                    url = figure[4]))
            # creates Subresource objects of type 'paragraph'
            paragraphs = parser.get_paragraphs(entry_url, entry_html_source)
            for i, paragraph in enumerate(paragraphs):
                subres.append(Subresource(containing_resource = res,
                    name = 'paragraph ' + str(i),
                    type = 'paragraph',
                    content = paragraph))
            subres_temp = Subresource.objects.bulk_create(subres)
            del subres_temp
            del subres
            sys.stdout.write(str(time()-start) + " seconds\n")
            sys.stdout.flush()
        except Exception, e:
            print "failed. exception: "+str(e)
            traceback.print_exc()'''

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    sys.stdout.write("~~~~loading concepts and term lists... ")
    start = time()
    file = open('scripts\MESH_concept_and_terms_tuple.pkl', 'rb')
    (tot_concepts, concept_IDs, term_lists) = pickle_zloads(file.read())
    file.close()
    sys.stdout.write("%.2f" % (time() - start) + " seconds\n")
    sys.stdout.flush()

    res_ids = list(
        Resource.objects.filter(type="NEJM review articles").values_list(
            'id', flat=True))
    print "total # of resources: " + str(len(res_ids))
    for count, res_id in enumerate(res_ids):
        try:
            sys.stdout.write("article # " + str(count) + " processing...")
            start = time()
            target_paragraphs = Subresource.objects.filter(
                containing_resource_id=res_id)

            #create sentences from target_paragraphs
            sentences = []
            sentences_indexofparagraph = []
            tot_para = 0
            tot_sent = 0
            for para_num, target_paragraph in enumerate(target_paragraphs):
                #find all sentence in this paragraph
                tokenized_sentences = sent_tokenize(
                    target_paragraph.content.rstrip())
                sentences.extend(tokenized_sentences)
                sentences_indexofparagraph.extend([para_num] *
                                                  len(tokenized_sentences))
                tot_sent = tot_sent + len(tokenized_sentences)
                tot_para = tot_para + 1
            tot_para = len(target_paragraphs)

            #second go through each concept/term, find them in subresources, and process into matrix
            tc = 0
            j = 0
            row_sentence = []
            row_paragraph = []
            col_sentence = []
            col_paragraph = []
            data_sentence = []
            data_paragraph = []
            # initialize list of empty lists for storing concepts contained in each paragraph
            para_conceptIDs_contained = [[] for i in range(tot_para)]
            for i, con_ID in enumerate(concept_IDs):
                term_list = term_lists[i]
                wordcount_in_paragraphs = [0] * tot_para
                terms_regex = [
                    r"\b" + re2.escape(term.lower()) + r"\b"
                    for term in term_list
                ]
                search_pattern = re2.compile("|".join(terms_regex))
                for sent_num, sentence in enumerate(sentences):
                    wordcount = len(search_pattern.findall(sentence.lower()))
                    if wordcount > 0:  #only go ahead if search_pattern is in the sentence
                        row_sentence.append(sent_num)
                        col_sentence.append(tc)
                        data_sentence.append(1)
                        wordcount_in_paragraphs[
                            sentences_indexofparagraph[sent_num]] += wordcount
                for para_num in range(tot_para):
                    wordcount_in_p = wordcount_in_paragraphs[para_num]
                    if wordcount_in_p > 0:
                        row_paragraph.append(para_num)
                        col_paragraph.append(tc)
                        data_paragraph.append(1)
                        para_conceptIDs_contained[para_num].append(con_ID)
                if tc * 10 / tot_concepts > j:
                    percent_done = tc * 10 / tot_concepts * 10
                    sys.stdout.write(str(percent_done) + "% ")
                    j = j + 1
                tc = tc + 1

            # update concepts_contained fields for all subresource objects
            for para_num in range(tot_para):
                if len(para_conceptIDs_contained[para_num]) > 0:
                    target_paragraphs[para_num].concepts_contained.add(
                        *para_conceptIDs_contained[para_num])

            #create target_A matrix
            target_A_sentence = coo_matrix(
                (array(data_sentence),
                 (array(row_sentence), array(col_sentence))),
                shape=(tot_sent, tot_concepts),
                dtype=int16)
            #target_A_paragraph = coo_matrix((array(data_paragraph),(array(row_paragraph),array(col_paragraph))),shape=(tot_para,tot_concepts),dtype=int16)

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # now convert target_A into a scipy csr_matrix (sparse matrix)
            target_A_sentence = target_A_sentence.tocsr()
            #target_A_paragraph = target_A_paragraph.tocsr()

            # calculate AtA for target_A
            AtA_sentence = target_A_sentence.T * target_A_sentence
            #AtA_paragraph = target_A_paragraph.T * target_A_paragraph

            # add AtA to Big_A
            if count == 0:
                bigA_AtA_sentence = AtA_sentence
                N_sentence = tot_sent
                #bigA_AtA_paragraph = AtA_paragraph
                #N_paragraph = tot_para
            else:
                bigA_AtA_sentence = bigA_AtA_sentence + AtA_sentence
                N_sentence = N_sentence + tot_sent
                #bigA_AtA_paragraph = bigA_AtA_paragraph + AtA_paragraph
                #N_paragraph = N_paragraph + tot_para

            sys.stdout.write(str(time() - start) + " seconds\n")
            sys.stdout.flush()
        except Exception, e:
            print "failed. exception: " + str(e)
            traceback.print_exc()
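The loop above turns each concept's term list into one escaped, word-bounded alternation, counts hits per sentence, and accumulates a sparse sentence-by-concept matrix. A minimal self-contained sketch of that core step, using the stdlib re module in place of re2 and two made-up sentences and term lists:

import re
from numpy import array, int16
from scipy.sparse import coo_matrix

# made-up sentences and MeSH-style term lists, just to show the mechanics
sentences = ["Aspirin reduces fever.", "Beta-blockers lower blood pressure."]
term_lists = [["aspirin", "acetylsalicylic acid"], ["blood pressure"]]

rows, cols, data = [], [], []
for concept_idx, term_list in enumerate(term_lists):
    # escape each term so characters like '-' or '(' are matched literally,
    # then join the terms into one word-bounded alternation
    pattern = re.compile("|".join(r"\b" + re.escape(t.lower()) + r"\b"
                                  for t in term_list))
    for sent_idx, sentence in enumerate(sentences):
        if pattern.findall(sentence.lower()):
            rows.append(sent_idx)
            cols.append(concept_idx)
            data.append(1)

A = coo_matrix((array(data), (array(rows), array(cols))),
               shape=(len(sentences), len(term_lists)), dtype=int16).tocsr()
print((A.T * A).toarray())   # concept co-occurrence counts across sentences

Escaping each term before joining is what lets terms containing characters such as '(', '+' or '-' be matched literally inside the combined pattern.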
Beispiel #21
0
 def on_trigger(self, message):
     """
     @type message: hubbot.message.IRCMessage
     """
     pointing_pattern = "^points at {}.+(kitty|kitteh)".format(re.escape(self.bot.nickname))
     if message.reply_to in self.bot.channels.keys():
         if "RoBoBo" not in self.bot.channels[message.reply_to].users.keys():
             if message.message_string.lower().startswith("meow"):
                 roll = hash((message.user.name, int(time.time()) / 3600, "meow")) % 20 + 1
                 if message.user.name == "BillTheCat":
                     return IRCResponse(ResponseType.SAY, "Uhm... Hi?", message.reply_to)
                 if message.user.name.startswith(
                         "Caitiri") or message.user.name == "Caity" or message.user.name.startswith("Heuf"):
                     if roll == 20:
                         return IRCResponse(ResponseType.DO,
                                            'points at {}, "CRITICAL KITTEH!"'.format(message.user.name),
                                            message.reply_to)
                     else:
                         return IRCResponse(ResponseType.DO,
                                            'points at {}, "KITTEH!"'.format(message.user.name),
                                            message.reply_to)
                 elif roll == 1:
                     reroll = hash((message.user.name, int(time.time()) / 3600, "meow", 42)) % 20 + 1
                     if reroll == 20:
                         return [IRCResponse(ResponseType.DO,
                                             'points at {}, "CRITICAL PUPPEH!"'.format(message.user.name),
                                             message.reply_to),
                                 IRCResponse(ResponseType.SAY,
                                             "Wait, what?",
                                             message.reply_to)]
                     else:
                         return IRCResponse(ResponseType.DO,
                                            'points at {}, "NOT KITTEH."'.format(message.user.name),
                                            message.reply_to)
                 elif (roll > 1) and (roll < 8):
                     return IRCResponse(ResponseType.DO,
                                        'points at {}, "NOT KITTEH."'.format(message.user.name),
                                        message.reply_to)
                 elif (roll > 7) and (roll < 14):
                     return IRCResponse(ResponseType.DO,
                                        'points at {}, "MEHBEH KITTEH?"'.format(message.user.name),
                                        message.reply_to)
                 elif (roll > 13) and (roll < 20):
                     return IRCResponse(ResponseType.DO,
                                        'points at {}, "KITTEH!"'.format(message.user.name),
                                        message.reply_to)
                 else:
                     return IRCResponse(ResponseType.DO,
                                        'points at {}, "CRITICAL KITTEH!"'.format(message.user.name),
                                        message.reply_to)
             elif message.message_string.lower().startswith("rawr"):
                 roll = hash((message.user.name, int(time.time()) / 3600, "rawr")) % 20 + 1
                 dragons = ["Itazu", "Trahsi", "reptile"]
                 if message.user.name in dragons:
                     return IRCResponse(ResponseType.SAY,
                                        "{} is a DRAGON!".format(message.user.name),
                                        message.reply_to)
                 elif roll == 1:
                     reroll = hash((message.user.name, int(time.time()) / 3600, "rawr", 42)) % 20 + 1
                     if reroll == 20:
                         return IRCResponse(ResponseType.SAY,
                                            "{} is SECRETLY A DRAGON!".format(message.user.name),
                                            message.reply_to)
                     else:
                         return IRCResponse(ResponseType.SAY,
                                            "{} is NOT a DINOSAUR.".format(message.user.name),
                                            message.reply_to)
                 elif (roll > 1) and (roll < 8):
                     return IRCResponse(ResponseType.SAY,
                                        "{} is NOT a DINOSAUR.".format(message.user.name),
                                        message.reply_to)
                 elif (roll > 7) and (roll < 14):
                     return IRCResponse(ResponseType.SAY,
                                        "{} MIGHT be a DINOSAUR.".format(message.user.name),
                                        message.reply_to)
                 elif (roll > 13) and (roll < 20):
                     return IRCResponse(ResponseType.SAY,
                                        "{} is a DINOSAUR.".format(message.user.name),
                                        message.reply_to)
                 else:
                     return IRCResponse(ResponseType.SAY,
                                        "{} is a CRITICAL DINOSAUR!".format(message.user.name),
                                        message.reply_to)
             elif message.type == "ACTION" and re.match(pointing_pattern, message.message_string, re.IGNORECASE):
                 return IRCResponse(ResponseType.SAY,
                                    "Curses, you've tumbled my nefarious plan!",
                                    message.reply_to)
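The pointing_pattern above interpolates the bot's nickname into a regex, so re.escape is what keeps nicknames containing regex metacharacters from breaking or silently altering the pattern. A small stand-alone illustration with a hypothetical nickname:

import re

nickname = "C++bot"   # hypothetical nickname containing regex metacharacters
pointing_pattern = "^points at {}.+(kitty|kitteh)".format(re.escape(nickname))
print(re.match(pointing_pattern, "points at C++bot, the kitteh", re.IGNORECASE) is not None)  # True

try:
    re.compile("^points at {}.+(kitty|kitteh)".format(nickname))   # unescaped
except re.error as exc:
    print("unescaped nickname breaks the pattern:", exc)           # multiple repeat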
Beispiel #22
0
import re2
import re
import timeit

def bump_num(matchobj):
    int_value = int(matchobj.group(0))
    return str(int_value + 1).encode('utf-8')


print(re2.sub(b'\\d+', bump_num, b'08.2 -2 23x99y'))
print(b'9.3 -3 24x100y')

s = b'\\1\\1'
print(re2.escape(s) == s)
print(re2.sub(b'(.)', re2.escape(s), b'x'))
print(re2.sub(b'(.)', re2.escape(s), b'x') == s)

import os.path as opath
path = opath.dirname(opath.abspath(__file__))
fn = opath.join(path, "tests", "genome.dat")
with open(fn, 'rb') as fd:
    genome = fd.read()

search = b"c[cg]cg[ag]g"
# search = b"cattctg"

re2_regex = re2.compile(search)
re_regex = re.compile(search)
def testre2():
    return re2_regex.findall(genome)
def testre():
    # the original snippet is truncated here; presumably it mirrors testre2
    return re_regex.findall(genome)
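The benchmark is cut off before the timing calls; a plausible sketch of how the two functions might be compared with the already-imported timeit module (the repetition count is arbitrary):

if __name__ == "__main__":
    # hypothetical timing harness; the original snippet ends before this point
    print("re2:", timeit.timeit(testre2, number=3))
    print("re: ", timeit.timeit(testre, number=3))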
Beispiel #23
0
def run():
	sys.stdout.write("~~~~loading concepts and term lists... ")
	start = time()
	file = open('scripts\MESH_concept_and_terms_tuple.pkl','rb')
	(tot_concepts, concept_IDs, term_lists) = pickle.loads(file.read())
	file.close()
	print "%.2f" % (time()-start), "seconds"
	
	for filenumber in [str(766-x) for x in range(20)]:
		print "FILENUM: " + filenumber
		row = []
		col = []
		data = []
		
		sys.stdout.write("~~~~parsing XML file... ")
		start = time()
		tree = ET.parse("..\..\PubMed\zip\medline13n0%s.xml" % filenumber)
		root = tree.getroot()
		citations = root.findall("MedlineCitation")
		sys.stdout.write("# citations: %d... " % len(citations))
		abstracts = []
		res_list = []
		for citation in citations:
			abstract_ET = citation.find("Article/Abstract")
			if abstract_ET is not None:
				abstract_textlist = []
				for t in abstract_ET.findall("AbstractText"):
					if t is not None:
						if t.text is not None:
							abstract_textlist.append(t.text)
				abstract = ' '.join(abstract_textlist)
				abstracts.append(abstract)
				res_tag = citation.find("PMID")
				if res_tag is None:
					url = ''
					identifier = ''
				else:
					identifier = res_tag.text
					url = "http://www.ncbi.nlm.nih.gov/pubmed/" + identifier
				res_tag = citation.find("Article/Language")
				if res_tag is None:
					language = ''
				else:
					language = res_tag.text[:2]
				res_tag = citation.find("Article/ArticleTitle")
				if res_tag is None:
					title = ''
				else:
					title = res_tag.text[:300]
				res_tag = citation.find("Article/Journal/JournalIssue/PubDate/Year")
				if res_tag is None:
					date = ''
				else:
					date = res_tag.text
				author_ET = citation.find("Article/AuthorList")
				if author_ET is not None:
					author_list = []
					for t in author_ET.getchildren():
						tt = t.find("LastName")
						if tt is not None:
							ttt = t.find("Initials")  # Initials is a sibling of LastName under Author, not a child of it
							if ttt is not None:
								author_list.append(tt.text+" "+ttt.text)
							else:
								author_list.append(tt.text)
					author = ', '.join(author_list)
					author = author[:767]	
				res_tag = citation.find("Article/Journal/ISOAbbreviation")
				if res_tag is None:
					journal = ''
				else:
					journal = res_tag.text[:50]
				res_tag = citation.find("Article/Journal/JournalIssue/Volume")
				if res_tag is None:
					volume = ''
				else:
					volume = res_tag.text
				res_tag = citation.find("Article/Journal/JournalIssue/Issue")
				if res_tag is None:
					issue = ''
				else:
					issue = res_tag.text
				res_tag = citation.find("Article/Pagination/MedlinePgn")
				if res_tag is None:
					firstpage = ''
				else:
					firstpage = res_tag.text.split('-')[0]
				res = Resource(identifier = identifier,
					type = "pubmed_abstract",
					language = language,
					title = title,
					date = date,
					publisher = '',
					author = author,
					journal = journal,
					volume = volume,
					issue = issue,
					firstpage = firstpage,
					lastpage = '',
					url = url,
					html_source = '')
				res_list.append(res)
		sys.stdout.write("# abstracts: %d... " % len(abstracts))
		print "%.2f" % (time()-start), "seconds"
		
		sys.stdout.write("~~~~crunching abstracts... ")
		start = time()
		abstract_conceptIDs_contained = [[] for i in range(len(abstracts))]
		for i, con_ID in enumerate(concept_IDs):
			if i % 1000 == 0:
				sys.stdout.write(str(int(i*100/tot_concepts)))
				sys.stdout.write("% ")
			term_list = term_lists[i]
			terms_regex = [r"\b"+re.escape(term.lower())+r"\b" for term in term_list]
			search_pattern = re.compile("|".join(terms_regex))	
		
			for abstract_num, abstract in enumerate(abstracts):
				wordcount = len(search_pattern.findall(abstract.lower()))
				if wordcount > 0:
					row.append(abstract_num)
					col.append(i)
					data.append(wordcount)
					abstract_conceptIDs_contained[abstract_num].append(con_ID)
		sys.stdout.write("... ")
		print "%.2f" % (time()-start), "seconds"	
		
		sys.stdout.write("~~~~saving file containing tuple of database object models... ")
		start = time()
		res_abstract_containedcon_tuplelist = []
		for abstract_num in range(len(abstracts)):
			res_abstract_containedcon_tuplelist.append((res_list[abstract_num], abstracts[abstract_num], abstract_conceptIDs_contained[abstract_num]))
		path = "scripts\\files_for_ec2\\res_abstract_containedcon_tuplelist_medline13n0%s.pkl" % filenumber
		file = open(path,'wb')
		file.write(pickle_zdumps(res_abstract_containedcon_tuplelist))
		file.close()
		print "%.2f" % (time()-start), "seconds"
		
		sys.stdout.write("~~~~creating target_A matrix... ")
		start = time()
		target_A = coo_matrix((array(data),(array(row),array(col))),shape=(len(abstracts),tot_concepts),dtype=int16)
		#now convert target_A into a scipy csr_matrix (sparse matrix)
		target_A = target_A.tocsr()
		path = "scripts\\pubmed_matrices\\rawA_medline13n0%s.pkl" % filenumber
		file = open(path,'wb')
		file.write(pickle_zdumps(target_A))
		file.close()
		print "%.2f" % (time()-start), "seconds"
	
	
	
	
	
	
	
	# Following is to be run on EC2 to reduce network latency #
	'''
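The citation-parsing loop above repeats the same find(...) / is None guard for every field. A small sketch of how that guard could be factored into a helper (hypothetical find_text helper, stdlib ElementTree only):

import xml.etree.ElementTree as ET

def find_text(elem, path, default='', maxlen=None):
    # return the text of the first element matching `path`, or `default` if absent
    tag = elem.find(path)
    if tag is None or tag.text is None:
        return default
    return tag.text[:maxlen] if maxlen else tag.text

citation = ET.fromstring(
    "<MedlineCitation><PMID>12345</PMID>"
    "<Article><ArticleTitle>An example title</ArticleTitle></Article>"
    "</MedlineCitation>")
print(find_text(citation, "PMID"))                            # 12345
print(find_text(citation, "Article/ArticleTitle", maxlen=300))
print(find_text(citation, "Article/Language") == '')          # True, tag is missing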
Beispiel #24
0
 def convert_to_regex(pattern):
     ret = "^{}$".format(re.escape(pattern))  # fnmatch.translate(pattern)
     return ret.replace("\\?", ".").replace("\\*", ".*")
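convert_to_regex escapes the whole glob-style pattern and then rewrites only the escaped wildcards, which works because re.escape always puts a backslash in front of '?' and '*'. A short usage sketch, assuming the function above is available at module level (fnmatch.translate, mentioned in the comment, is the stdlib equivalent):

import re

pattern = convert_to_regex("data_?.csv")    # glob-style '?' -> '.', '*' -> '.*'
print(re.match(pattern, "data_1.csv") is not None)    # True
print(re.match(pattern, "data_12.csv") is not None)   # False: '?' matches exactly one character
print(re.match(convert_to_regex("*.txt"), "notes.txt") is not None)   # True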
Beispiel #25
0
                if (t := self._match(r'[ \t]*')) and self._check(r'[^"\n]+'):
                    level = len(t.value)
                    if level > self.indent[-1]:
                        self.indent.append(level)
                        yield 'INDENT', None, m.pos
                    elif level not in self.indent:
                        self._lexical_error('inconsistent indentation')
                    elif level < self.indent[-1]:
                        while self.indent[-1] > level:
                            self.indent.pop()
                            yield 'DEDENT', None, m.pos

            return '__MANY', ('NEWLINE', None,
                              m.pos), *(compute_indent() or ())
        elif m := self._match('|'.join(
                re.escape(special) for special in (self.SPECIALS))):
            return m.value, m.value, m.pos
        elif m := self._match('\x00'):
            return 'EOF', None, m.pos
        elif self._match('[ \t\r]+'):
            return self._next()
        else:
            self._lexical_error('bad lexeme')

    def lex(self):
        while token := self._next():
            if token[0] == '__MANY':
                for entry in token[1:]:
                    yield Token(*entry)
            else:
                yield Token(*token)
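The lexer above matches operators by joining the escaped SPECIALS into a single alternation. Since Python's re picks the first matching branch rather than the longest one, such an alternation only recognises multi-character operators if they appear before their one-character prefixes; a hedged sketch with a hypothetical operator set:

import re

SPECIALS = ('==', '=', '->', '-', '(', ')')   # hypothetical operator set
# sort longest-first so '==' is tried before '=' and '->' before '-'
pattern = re.compile('|'.join(
    re.escape(s) for s in sorted(SPECIALS, key=len, reverse=True)))

print(pattern.match('== rest').group())   # '=='
print(pattern.match('-> x').group())      # '->'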
Beispiel #26
0
 def test_bug_612074(self):
     pat=u"["+re.escape(u"\u2039")+u"]"
     self.assertEqual(re.compile(pat) and 1, 1)
Beispiel #27
0
 #second go through each concept/term, find them in subresources, and process into matrix
 tc = 0
 j = 0
 row_sentence = []
 row_paragraph = []
 col_sentence = []
 col_paragraph = []
 data_sentence = []
 data_paragraph = []
 # initialize list of empty lists for storing concepts contained in each paragraph
 para_conceptIDs_contained = [[] for i in range(tot_para)]
 for i, con_ID in enumerate(concept_IDs):
     term_list = term_lists[i]
     wordcount_in_paragraphs = [0] * tot_para
     terms_regex = [
         r"\b" + re2.escape(term.lower()) + r"\b"
         for term in term_list
     ]
     search_pattern = re2.compile("|".join(terms_regex))
     for sent_num, sentence in enumerate(sentences):
         wordcount = len(search_pattern.findall(sentence.lower()))
         if wordcount > 0:  #only go ahead if search_pattern is in the sentence
             row_sentence.append(sent_num)
             col_sentence.append(tc)
             data_sentence.append(1)
             wordcount_in_paragraphs[
                 sentences_indexofparagraph[sent_num]] += wordcount
     for para_num in range(tot_para):
         wordcount_in_p = wordcount_in_paragraphs[para_num]
         if wordcount_in_p > 0:
             row_paragraph.append(para_num)
Beispiel #28
0
def run():
    '''#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    start = time()
    # Browser
    br = mechanize.Browser()
    # Cookie Jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    #br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)
    # User-Agent
    br.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
    print "initiated browser: " + str(time()-start) + " seconds"
    
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    # PMIDs and DOIs of Radiology review articles from 2003/01 to 2013/01
    identifiers = [{"pmid":12601205,"doi":"10.1148/radiol.2263011540"},{"pmid":12616012,"doi":"10.1148/radiol.2271001744"},{"pmid":12616015,"doi":"10.1148/radiol.2263020109"},{"pmid":12637675,"doi":"10.1148/radiol.2272011329"},{"pmid":12637677,"doi":"10.1148/radiol.2272012071"},{"pmid":12668742,"doi":"10.1148/radiol.2271010938"},{"pmid":12738874,"doi":"10.1148/radiol.2281020307"},{"pmid":12773666,"doi":"10.1148/radiol.2273011499"},{"pmid":12819343,"doi":"10.1148/radiol.2282011726"},{"pmid":12832569,"doi":"10.1148/radiol.2281020874"},{"pmid":12832573,"doi":"10.1148/radiol.2281021567"},{"pmid":12954885,"doi":"10.1148/radiol.2283030674"},{"pmid":12954888,"doi":"10.1148/radiol.2283021557"},{"pmid":14500855,"doi":"10.1148/radiol.2292030516"},{"pmid":14519867,"doi":"10.1148/radiol.2291020222"},{"pmid":14593188,"doi":"10.1148/radiol.2293010899"},{"pmid":14595138,"doi":"10.1148/radiol.2292020402"},{"pmid":14657300,"doi":"10.1148/radiol.2293031280"},{"pmid":14695382,"doi":"10.1148/radiol.2301031028"},{"pmid":14695386,"doi":"10.1148/radiol.2301021122"},{"pmid":14695395,"doi":"10.1148/radiol.2301021482"},{"pmid":14739312,"doi":"10.1148/radiol.2303021726"},{"pmid":14752175,"doi":"10.1148/radiol.2302031698"},{"pmid":14752178,"doi":"10.1148/radiol.2302021489"},{"pmid":14990813,"doi":"10.1148/radiol.2311020452"},{"pmid":15044750,"doi":"10.1148/radiol.2312021185"},{"pmid":15068942,"doi":"10.1148/radiol.2311021620"},{"pmid":15118110,"doi":"10.1148/radiol.2313021488"},{"pmid":15128979,"doi":"10.1148/radiol.2312032097"},{"pmid":15163803,"doi":"10.1148/radiol.2313040154"},{"pmid":15163813,"doi":"10.1148/radiol.2313030173"},{"pmid":15220490,"doi":"10.1148/radiol.2321021803"},{"pmid":15220491,"doi":"10.1148/radiol.2321030636"},{"pmid":15284429,"doi":"10.1148/radiol.2323031558"},{"pmid":15284433,"doi":"10.1148/radiol.2323030830"},{"pmid":15286305,"doi":"10.1148/radiol.2322021326"},{"pmid":15286311,"doi":"10.1148/radiol.2322040305"},{"pmid":15317956,"doi":"10.1148/radiol.2331020777"},{"pmid":15375227,"doi":"10.1148/radiol.2332031119"},{"pmid":15454614,"doi":"10.1148/radiol.2331041059"},{"pmid":15498896,"doi":"10.1148/radiol.2333031150"},{"pmid":15564389,"doi":"10.1148/radiol.2341031302"},{"pmid":15601895,"doi":"10.1148/radiol.2342031990"},{"pmid":15650038,"doi":"10.1148/radiol.2343030333"},{"pmid":15670993,"doi":"10.1148/radiol.2342030897"},{"pmid":15716389,"doi":"10.1148/radiol.2351031455"},{"pmid":15734922,"doi":"10.1148/radiol.2343041670"},{"pmid":15734925,"doi":"10.1148/radiol.2343031362"},{"pmid":15734929,"doi":"10.1148/radiol.2343031768"},{"pmid":15734940,"doi":"10.1148/radiol.2343030946"},{"pmid":15833981,"doi":"10.1148/radiol.2353040037"},{"pmid":15845798,"doi":"10.1148/radiol.2353042205"},{"pmid":15858079,"doi":"10.1148/radiol.2352040330"},{"pmid":15858080,"doi":"10.1148/radiol.2352040307"},{"pmid":15858081,"doi":"10.1148/radiol.2352040727"},{"pmid":15858087,"doi":"10.1148/radiol.2352040262"},{"pmid":15858096,"doi":"10.1148/radiol.2352032121"},{"pmid":15860674,"doi":"10.1148/radiol.2353040457"},{"pmid":15914473,"doi":"10.1148/radiol.2353041760"},{"pmid":15914474,"doi":"10.1148/radiol.2353041865"},{"pmid":15972340,"doi":"10.1148/radiol.2362040513"},{"pmid":15983074,"doi":"10.1148/radiol.2361041278"},{"pmid":15987959,"doi":"10.1148/radiol.2361041926"},{"pmid":15987960,"doi":"10.1148/radiol.2361031674"},{"pmid":16100082,"doi":"10.1148/radiol.2371040585"},{"pmid":16118165,"doi":"10.1148/radiol.2363041042"},{"pmid":16170017,"doi":"10.1148/radiol.2372050199"},{"pmid":16237143,"doi":"10.1148/radiol.2373041717"},
{"pmid":16251391,"doi":"10.1148/radiol.2373040966"},{"pmid":16304103,"doi":"10.1148/radiol.2373050220"},{"pmid":16304111,"doi":"10.1148/radiol.2373050176"},{"pmid":16373757,"doi":"10.1148/radiol.2381041602"},{"pmid":16436808,"doi":"10.1148/radiol.2382051462"},{"pmid":16436809,"doi":"10.1148/radiol.2382041977"},{"pmid":16452394,"doi":"10.1148/radiol.2382050062"},{"pmid":16452395,"doi":"10.1148/radiol.2382050063"},{"pmid":16505391,"doi":"10.1148/radiol.2383041109"},{"pmid":16543592,"doi":"10.1148/radiol.2392050413"},{"pmid":16567481,"doi":"10.1148/radiol.2391041043"},{"pmid":16567482,"doi":"10.1148/radiol.2391050343"},{"pmid":16641348,"doi":"10.1148/radiol.2392052002"},{"pmid":16709793,"doi":"10.1148/radiol.2401050061"},{"pmid":16714455,"doi":"10.1148/radiol.2393042031"},{"pmid":16714456,"doi":"10.1148/radiol.2393050823"},{"pmid":16720868,"doi":"10.1148/radiol.2401050134"},{"pmid":16864664,"doi":"10.1148/radiol.2402050314"},{"pmid":16926320,"doi":"10.1148/radiol.2403050818"},{"pmid":16926321,"doi":"10.1148/radiol.2403050542"},{"pmid":16990669,"doi":"10.1148/radiol.2411050628"},{"pmid":17053199,"doi":"10.1148/radiol.2413051358"},{"pmid":17057062,"doi":"10.1148/radiol.2412060169"},{"pmid":17057063,"doi":"10.1148/radiol.2412041866"},{"pmid":17090716,"doi":"10.1148/radiol.2421052011"},{"pmid":17114619,"doi":"10.1148/radiol.2413051535"},{"pmid":17185659,"doi":"10.1148/radiol.2421052135"},{"pmid":17185660,"doi":"10.1148/radiol.2421051180"},{"pmid":17185662,"doi":"10.1148/radiol.2421050677"},{"pmid":17229874,"doi":"10.1148/radiol.2423051403"},{"pmid":17255408,"doi":"10.1148/radiol.2422051113"},{"pmid":17325062,"doi":"10.1148/radiol.2423051631"},{"pmid":17325078,"doi":"10.1148/radiol.2423041600"},{"pmid":17384237,"doi":"10.1148/radiol.2432050057"},{"pmid":17392247,"doi":"10.1148/radiol.2431030580"},{"pmid":17431128,"doi":"10.1148/radiol.2433060243"},{"pmid":17446526,"doi":"10.1148/radiol.2433061411"},{"pmid":17456864,"doi":"10.1148/radiol.2432060009"},{"pmid":17456865,"doi":"10.1148/radiol.2432060307"},{"pmid":17456883,"doi":"10.1148/radiol.2432030499"},{"pmid":17495176,"doi":"10.1148/radiol.2441052145"},{"pmid":17507723,"doi":"10.1148/radiol.2441060773"},{"pmid":17517922,"doi":"10.1148/radiol.2433070350"},{"pmid":17517924,"doi":"10.1148/radiol.2433060850"},{"pmid":17517925,"doi":"10.1148/radiol.2433051098"},{"pmid":17517926,"doi":"10.1148/radiol.2433051649"},{"pmid":17522346,"doi":"10.1148/radiol.2441051790"},{"pmid":17581895,"doi":"10.1148/radiol.2441051769"},{"pmid":17581896,"doi":"10.1148/radiol.2441060995"},{"pmid":17592037,"doi":"10.1148/radiol.2442060136"},{"pmid":17641360,"doi":"10.1148/radiol.2442051766"},{"pmid":17641361,"doi":"10.1148/radiol.2442051620"},{"pmid":17709823,"doi":"10.1148/radiol.2443060295"},{"pmid":17709824,"doi":"10.1148/radiol.2443051661"},{"pmid":17709825,"doi":"10.1148/radiol.2443060582"},{"pmid":17848679,"doi":"10.1148/radiol.2451061280"},{"pmid":17848685,"doi":"10.1148/radiol.2452070397"},{"pmid":17885179,"doi":"10.1148/radiol.2451060731"},{"pmid":17885180,"doi":"10.1148/radiol.2451051706"},{"pmid":17885181,"doi":"10.1148/radiol.2451051359"},{"pmid":17885185,"doi":"10.1148/radiol.2451061204"},{"pmid":17940297,"doi":"10.1148/radiol.2452061117"},{"pmid":17940298,"doi":"10.1148/radiol.2452061706"},{"pmid":17940300,"doi":"10.1148/radiol.2452060445"},{"pmid":17940301,"doi":"10.1148/radiol.2452061031"},{"pmid":18024448,"doi":"10.1148/radiol.2453060798"},{"pmid":18024449,"doi":"10.1148/radiol.2453061481"},{"pmid":18096524,"doi":"10.1148/radiol.2461061676"},{"pmid":18096526,"d
oi":"10.1148/radiol.2461061994"},{"pmid":18096527,"doi":"10.1148/radiol.2461061245"},{"pmid":18223119,"doi":"10.1148/radiol.2463061038"},{"pmid":18227534,"doi":"10.1148/radiol.2462071831"},{"pmid":18227535,"doi":"10.1148/radiol.2461070309"},{"pmid":18227536,"doi":"10.1148/radiol.2462061775"},{"pmid":18227540,"doi":"10.1148/radiol.2461070121"},{"pmid":18309012,"doi":"10.1148/radiol.2463060881"},{"pmid":18310461,"doi":"10.1148/radiol.2472061846"},{"pmid":18375837,"doi":"10.1148/radiol.2473061909"},{"pmid":18430871,"doi":"10.1148/radiol.2472061331"},{"pmid":18487532,"doi":"10.1148/radiol.2473062124"},{"pmid":18566164,"doi":"10.1148/radiol.2481080256"},{"pmid":18566166,"doi":"10.1148/radiol.2481072190"},{"pmid":18566168,"doi":"10.1148/radiol.2481071497"},{"pmid":18566169,"doi":"10.1148/radiol.2481060339"},{"pmid":18566177,"doi":"10.1148/radiol.2481071451"},{"pmid":18641243,"doi":"10.1148/radiol.2482070988"},{"pmid":18641245,"doi":"10.1148/radiol.2482062110"},{"pmid":18710972,"doi":"10.1148/radiol.2483070362"},{"pmid":18710973,"doi":"10.1148/radiol.2483062112"},{"pmid":18710974,"doi":"10.1148/radiol.2483071416"},{"pmid":18796665,"doi":"10.1148/radiol.2491070783"},{"pmid":18812557,"doi":"10.1148/radiol.2491071336"},{"pmid":18936309,"doi":"10.1148/radiol.2492071313"},{"pmid":19011181,"doi":"10.1148/radiol.2493070976"},{"pmid":19011184,"doi":"10.1148/radiol.2493080240"},{"pmid":19092089,"doi":"10.1148/radiol.2501071322"},{"pmid":19188309,"doi":"10.1148/radiol.2502081075"},{"pmid":19188310,"doi":"10.1148/radiol.2502071998"},{"pmid":19244037,"doi":"10.1148/radiol.2503080253"},{"pmid":19332844,"doi":"10.1148/radiol.2511071897"},{"pmid":19401568,"doi":"10.1148/radiol.2512080485"},{"pmid":19401569,"doi":"10.1148/radiol.2512081235"},{"pmid":19474372,"doi":"10.1148/radiol.2513080636"},{"pmid":19561247,"doi":"10.1148/radiol.2513081280"},{"pmid":19703877,"doi":"10.1148/radiol.2522082335"},{"pmid":19717748,"doi":"10.1148/radiol.2523081972"},{"pmid":19717750,"doi":"10.1148/radiol.2523081929"},{"pmid":19789250,"doi":"10.1148/radiol.2531090611"},{"pmid":19789251,"doi":"10.1148/radiol.2531090689"},{"pmid":19789254,"doi":"10.1148/radiol.2531090302"},{"pmid":19864525,"doi":"10.1148/radiol.2532081199"},{"pmid":19864526,"doi":"10.1148/radiol.2532081738"},{"pmid":12511664,"doi":"10.1148/radiol.2261021292"},{"pmid":12511666,"doi":"10.1148/radiol.2261011296"},{"pmid":12563122,"doi":"10.1148/radiol.2262011600"},{"pmid":12563154,"doi":"10.1148/radiol.2262011992"},{"pmid":19952025,"doi":"10.1148/radiol.2533090179"},{"pmid":20032141,"doi":"10.1148/radiol.2541090361"},{"pmid":20032142,"doi":"10.1148/radiol.09090021"},{"pmid":20032157,"doi":"10.1148/radiol.09090690"},{"pmid":20089722,"doi":"10.1148/radiol.09090552"},{"pmid":20093507,"doi":"10.1148/radiol.2542082312"},{"pmid":20177082,"doi":"10.1148/radiol.09091264"},{"pmid":20177083,"doi":"10.1148/radiol.09092100"},{"pmid":20177084,"doi":"10.1148/radiol.09090330"},{"pmid":20177086,"doi":"10.1148/radiol.09091324"},{"pmid":20308442,"doi":"10.1148/radiol.09090339"},{"pmid":20413748,"doi":"10.1148/radiol.10090105"},{"pmid":20501711,"doi":"10.1148/radiol.10090877"},{"pmid":20505067,"doi":"10.1148/radiol.10100213"},{"pmid":20574084,"doi":"10.1148/radiol.10090908"},{"pmid":20574087,"doi":"10.1148/radiol.10091938"},{"pmid":20634431,"doi":"10.1148/radiol.10091982"},{"pmid":20720066,"doi":"10.1148/radiol.10092307"},{"pmid":20720065,"doi":"10.1148/radiol.10090397"},{"pmid":20736332,"doi":"10.1148/radiol.10100570"},{"pmid":20829537,"doi":"10.1148/radiol.10100070"},{"pmid":20851933,"doi"
:"10.1148/radiol.10091298"},{"pmid":20851934,"doi":"10.1148/radiol.10091480"},{"pmid":20851938,"doi":"10.1148/radiol.10091210"},{"pmid":20935079,"doi":"10.1148/radiol.10092373"},{"pmid":20959547,"doi":"10.1148/radiol.10091269"},{"pmid":21084413,"doi":"10.1148/radiol.10100140"},{"pmid":21084414,"doi":"10.1148/radiol.10081490"},{"pmid":21163918,"doi":"10.1148/radiol.10101157"},{"pmid":21183492,"doi":"10.1148/radiol.10092129"},{"pmid":21273517,"doi":"10.1148/radiol.10100161"},{"pmid":21273518,"doi":"10.1148/radiol.10100116"},{"pmid":21273519,"doi":"10.1148/radiol.10081634"},{"pmid":21330566,"doi":"10.1148/radiol.11100569"},{"pmid":21339346,"doi":"10.1148/radiol.10100376"},{"pmid":21339345,"doi":"10.1148/radiol.10100025"},{"pmid":21415247,"doi":"10.1148/radiol.11101887"},{"pmid":21436096,"doi":"10.1148/radiol.11100155"},{"pmid":21502390,"doi":"10.1148/radiol.11090563"},{"pmid":21502391,"doi":"10.1148/radiol.11091276"},{"pmid":21586679,"doi":"10.1148/radiol.11101352"},{"pmid":21602502,"doi":"10.1148/radiol.11081489"},{"pmid":21602503,"doi":"10.1148/radiol.11101362"},{"pmid":21693659,"doi":"10.1148/radiol.11110333"},{"pmid":21778451,"doi":"10.1148/radiol.11101359"},{"pmid":21778450,"doi":"10.1148/radiol.11101104"},{"pmid":21803921,"doi":"10.1148/radiol.11101344"},{"pmid":21931140,"doi":"10.1148/radiol.11101688"},{"pmid":21931139,"doi":"10.1148/radiol.11101922"},{"pmid":21931141,"doi":"10.1148/radiol.11091822"},{"pmid":22012900,"doi":"10.1148/radiol.11111099"},{"pmid":22012903,"doi":"10.1148/radiol.11091882"},{"pmid":22012902,"doi":"10.1148/radiol.11101426"},{"pmid":22012904,"doi":"10.1148/radiol.11091207"},{"pmid":22012899,"doi":"10.1148/radiol.11111131"},{"pmid":22095994,"doi":"10.1148/radiol.11110474"},{"pmid":22095995,"doi":"10.1148/radiol.11091710"},{"pmid":22156992,"doi":"10.1148/radiol.11110423"},{"pmid":22190655,"doi":"10.1148/radiol.11101996"},{"pmid":22190656,"doi":"10.1148/radiol.11110144"},{"pmid":22357880,"doi":"10.1148/radiol.11110947"},{"pmid":22357881,"doi":"10.1148/radiol.11101384"},{"pmid":22438443,"doi":"10.1148/radiol.11111111"},{"pmid":22438439,"doi":"10.1148/radiol.12110462"},{"pmid":22438440,"doi":"10.1148/radiol.11101821"},{"pmid":22517953,"doi":"10.1148/radiol.12110446"},{"pmid":22517956,"doi":"10.1148/radiol.12111869"},{"pmid":22517954,"doi":"10.1148/radiol.12110433"},{"pmid":22517959,"doi":"10.1148/radiol.12111605"},{"pmid":22623691,"doi":"10.1148/radiol.12110526"},{"pmid":22623690,"doi":"10.1148/radiol.12102394"},{"pmid":22623696,"doi":"10.1148/radiol.12112114"},{"pmid":22692035,"doi":"10.1148/radiol.12112265"},{"pmid":22723560,"doi":"10.1148/radiol.12110772"},{"pmid":22723559,"doi":"10.1148/radiol.12110339"},{"pmid":22798223,"doi":"10.1148/radiol.12111561"},{"pmid":22821690,"doi":"10.1148/radiol.12112678"},{"pmid":22821695,"doi":"10.1148/radiol.12111703"},{"pmid":22821694,"doi":"10.1148/radiol.12111658"},{"pmid":22919038,"doi":"10.1148/radiol.12110810"},{"pmid":22919039,"doi":"10.1148/radiol.12110357"},{"pmid":22993219,"doi":"10.1148/radiol.12111270"},{"pmid":22993217,"doi":"10.1148/radiol.12111769"},{"pmid":22966066,"doi":"10.1148/radiol.12112201"},{"pmid":23093707,"doi":"10.1148/radiol.12111740"},{"pmid":23175542,"doi":"10.1148/radiol.12120354"},{"pmid":23264525,"doi":"10.1148/radiol.12112469"},{"pmid":23220901,"doi":"10.1148/radiol.12110853"},{"pmid":23070271,"doi":"10.1148/radiol.12120240"}]
    for count, ident in enumerate(identifiers):
        doi = ident['doi']
        url = 'http://pubs.rsna.org/doi/full/%s' % doi
        try:
            sys.stdout.write("article # " + str(count) + " reading url...")
            limitReached = True
            while True:
                if not limitReached:
                    break
                try:
                    start = time()
                    r = br.open(url)
                    limitReached = False
                except:
                    limitReached = True
                    sys.stdout.write("limit reached, waiting...")
                    sleep(3600)
            entry_url = r.geturl()
            entry_html_source = r.read()
            soup = BeautifulSoup(entry_html_source.decode('utf-8'), 'html5lib')
            is_not_free = soup.find(id='accessDenialWidget')
            if is_not_free is not None:
                sys.stdout.write(str(time()-start) + " seconds")
                sys.stdout.write("...skipping, article not free.\n")
                sys.stdout.flush()
            else:
                sys.stdout.write("adding to database...")
                # format of returned list from get_metadata function:
                # 0 identifier
                # 1 type
                # 2 language
                # 3 title
                # 4 date
                # 5 publisher
                # 6 author
                # 7 journal
                # 8 volume
                # 9 issue
                # 10 firstpage
                # 11 lastpage
                # 12 url
                res_metadata = parser.get_metadata(entry_url, entry_html_source)
                res_metadata[1] = 'Radiology'
                res_metadata[0] = doi
                # creates new Resource object and containing Subresource objects
                # creates Resource based on returned parser metadata
                res = Resource(identifier = res_metadata[0],
                    type = res_metadata[1],
                    language = res_metadata[2],
                    title = res_metadata[3],
                    date = res_metadata[4],
                    publisher = res_metadata[5],
                    author = res_metadata[6],
                    journal = res_metadata[7],
                    volume = res_metadata[8],
                    issue = res_metadata[9],
                    firstpage = res_metadata[10],
                    lastpage = res_metadata[11],
                    url = entry_url,
                    html_source = entry_html_source)
                res.save()
                res.user.add(9) # corresponds to [email protected]
                #res.user.add(2) # corresponds to [email protected]
                res.domain.add(1) # corresponds to Biomedical
                subres = []
                # creates Subresource objects of type 'figure'
                figures = parser.get_figures(entry_url, entry_html_source)
                for i, figure in enumerate(figures):
                    try:
                        f = urllib2.urlopen(urllib2.Request(figure[4]))
                        deadLinkFound = False
                    except:
                        deadLinkFound = True
                    if deadLinkFound:
                        url_correct = figure[3]
                    else:
                        url_correct = figure[4]
                    subres.append(Subresource(containing_resource = res,
                        name = figure[0],
                        type = 'figure',
                        content = figure[1],
                        url = url_correct))
                # creates Subresource objects of type 'paragraph'
                paragraphs = parser.get_paragraphs(entry_url, entry_html_source)
                for i, paragraph in enumerate(paragraphs):
                    subres.append(Subresource(containing_resource = res,
                        name = 'paragraph ' + str(i),
                        type = 'paragraph',
                        content = paragraph))
                subres_temp = Subresource.objects.bulk_create(subres)
                del subres_temp
                del subres
                sys.stdout.write(str(time()-start) + " seconds\n")
                sys.stdout.flush()
        except Exception, e:
            print "failed. exception: "+str(e)
            traceback.print_exc()'''

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    sys.stdout.write("~~~~loading concepts and term lists... ")
    start = time()
    file = open('scripts\MESH_concept_and_terms_tuple.pkl', 'rb')
    (tot_concepts, concept_IDs, term_lists) = pickle_zloads(file.read())
    file.close()
    sys.stdout.write("%.2f" % (time() - start) + " seconds\n")
    sys.stdout.flush()

    res_ids = list(
        Resource.objects.filter(type="Radiology").values_list('id', flat=True))
    print "total # of resources: " + str(len(res_ids))
    for count, res_id in enumerate(res_ids):
        try:
            sys.stdout.write("article # " + str(count) + " processing...")
            start = time()
            target_paragraphs = Subresource.objects.filter(
                containing_resource_id=res_id)

            #create sentences from target_paragraphs
            sentences = []
            sentences_indexofparagraph = []
            tot_para = 0
            tot_sent = 0
            for para_num, target_paragraph in enumerate(target_paragraphs):
                #find all sentence in this paragraph
                tokenized_sentences = sent_tokenize(
                    target_paragraph.content.rstrip())
                sentences.extend(tokenized_sentences)
                sentences_indexofparagraph.extend([para_num] *
                                                  len(tokenized_sentences))
                tot_sent = tot_sent + len(tokenized_sentences)
                tot_para = tot_para + 1
            tot_para = len(target_paragraphs)

            #second go through each concept/term, find them in subresources, and process into matrix
            tc = 0
            j = 0
            row_sentence = []
            row_paragraph = []
            col_sentence = []
            col_paragraph = []
            data_sentence = []
            data_paragraph = []
            # initialize list of empty lists for storing concepts contained in each paragraph
            para_conceptIDs_contained = [[] for i in range(tot_para)]
            for i, con_ID in enumerate(concept_IDs):
                term_list = term_lists[i]
                wordcount_in_paragraphs = [0] * tot_para
                terms_regex = [
                    r"\b" + re2.escape(term.lower()) + r"\b"
                    for term in term_list
                ]
                search_pattern = re2.compile("|".join(terms_regex))
                for sent_num, sentence in enumerate(sentences):
                    wordcount = len(search_pattern.findall(sentence.lower()))
                    if wordcount > 0:  #only go ahead if search_pattern is in the sentence
                        row_sentence.append(sent_num)
                        col_sentence.append(tc)
                        data_sentence.append(1)
                        wordcount_in_paragraphs[
                            sentences_indexofparagraph[sent_num]] += wordcount
                for para_num in range(tot_para):
                    wordcount_in_p = wordcount_in_paragraphs[para_num]
                    if wordcount_in_p > 0:
                        row_paragraph.append(para_num)
                        col_paragraph.append(tc)
                        data_paragraph.append(1)
                        para_conceptIDs_contained[para_num].append(con_ID)
                if tc * 10 / tot_concepts > j:
                    percent_done = tc * 10 / tot_concepts * 10
                    sys.stdout.write(str(percent_done) + "% ")
                    j = j + 1
                tc = tc + 1

            # update concepts_contained fields for all subresource objects
            for para_num in range(tot_para):
                if len(para_conceptIDs_contained[para_num]) > 0:
                    target_paragraphs[para_num].concepts_contained.add(
                        *para_conceptIDs_contained[para_num])

            #create target_A matrix
            target_A_sentence = coo_matrix(
                (array(data_sentence),
                 (array(row_sentence), array(col_sentence))),
                shape=(tot_sent, tot_concepts),
                dtype=int16)
            #target_A_paragraph = coo_matrix((array(data_paragraph),(array(row_paragraph),array(col_paragraph))),shape=(tot_para,tot_concepts),dtype=int16)

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # now convert target_A into a scipy csr_matrix (sparse matrix)
            target_A_sentence = target_A_sentence.tocsr()
            #target_A_paragraph = target_A_paragraph.tocsr()

            # calculate AtA for target_A
            AtA_sentence = target_A_sentence.T * target_A_sentence
            #AtA_paragraph = target_A_paragraph.T * target_A_paragraph

            # add AtA to Big_A
            if count == 0:
                bigA_AtA_sentence = AtA_sentence
                N_sentence = tot_sent
                #bigA_AtA_paragraph = AtA_paragraph
                #N_paragraph = tot_para
            else:
                bigA_AtA_sentence = bigA_AtA_sentence + AtA_sentence
                N_sentence = N_sentence + tot_sent
                #bigA_AtA_paragraph = bigA_AtA_paragraph + AtA_paragraph
                #N_paragraph = N_paragraph + tot_para

            sys.stdout.write(str(time() - start) + " seconds\n")
            sys.stdout.flush()
        except Exception, e:
            print "failed. exception: " + str(e)
            traceback.print_exc()
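The accumulation step adds each article's AtA_sentence into bigA_AtA_sentence instead of stacking all sentence rows first. The two are equivalent because, for vertically stacked blocks, (stacked A)^T (stacked A) equals the sum of the per-block A_i^T A_i; a tiny check with random data of arbitrary size:

import numpy as np
from scipy.sparse import csr_matrix, vstack

rng = np.random.default_rng(0)
blocks = [csr_matrix(rng.integers(0, 2, size=(5, 4))) for _ in range(3)]  # 3 fake articles

summed = sum(b.T * b for b in blocks)            # accumulate per-article AtA
stacked = vstack(blocks).tocsr()                 # stack all sentence rows instead
print(np.array_equal(summed.toarray(), (stacked.T * stacked).toarray()))  # True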
Beispiel #29
0
 def test_escape(self, pattern, expected_escaped):
     escaped = re2.escape(pattern)
     self.assertEqual(expected_escaped, escaped)
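The pattern/expected pairs driving this test come from a parametrization that is not shown. As a rough illustration of the kind of pairs such a test checks, here is a plain unittest version using the stdlib re module with hypothetical cases (re2.escape's output for other inputs may differ):

import re
import unittest

class TestEscape(unittest.TestCase):
    # hypothetical pattern/expected pairs; the real values for re2 live in the
    # parametrization that is not part of the snippet
    cases = [("a.b", r"a\.b"), ("1+1", r"1\+1"), ("[set]", r"\[set\]")]

    def test_escape(self):
        for pattern, expected_escaped in self.cases:
            self.assertEqual(expected_escaped, re.escape(pattern))

if __name__ == "__main__":
    unittest.main()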
Beispiel #30
0
 
 #second go through each concept/term, find them in subresources, and process into matrix
 tc = 0
 j = 0
 row_sentence = []
 row_paragraph = []
 col_sentence = []
 col_paragraph = []
 data_sentence = []
 data_paragraph = []
 # initialize list of empty lists for storing concepts contained in each paragraph
 para_conceptIDs_contained = [[] for i in range(tot_para)]
 for i, con_ID in enumerate(concept_IDs):
     term_list = term_lists[i]
     wordcount_in_paragraphs = [0] * tot_para
     terms_regex = [r"\b"+re2.escape(term.lower())+r"\b" for term in term_list]
     search_pattern = re2.compile("|".join(terms_regex))
     for sent_num, sentence in enumerate(sentences):
         wordcount = len(search_pattern.findall(sentence.lower()))
         if wordcount > 0: #only go ahead if search_pattern is in the sentence
             row_sentence.append(sent_num)
             col_sentence.append(tc)
             data_sentence.append(1)
             wordcount_in_paragraphs[sentences_indexofparagraph[sent_num]] += wordcount
     for para_num in range(tot_para):
         wordcount_in_p = wordcount_in_paragraphs[para_num]
         if wordcount_in_p > 0:
             row_paragraph.append(para_num)
             col_paragraph.append(tc)
             data_paragraph.append(1)
             para_conceptIDs_contained[para_num].append(con_ID)
Beispiel #31
0
	def __init__(self, batch_size, n_cpus, n_threads, mode):

		print('loading model...', end=' ')
		self.nlp = english_model.load()
		self.nlp.remove_pipe('tagger')
		self.nlp.remove_pipe('ner')
		
		punct = list(string.punctuation)
		punct.remove('.')
		punct.append('[**')
		punct.append('**]')
		punct = [re.escape(p) for p in punct]
		
		prefixes_custom = tuple(punct)
		infixes_custom = tuple(punct)
		suffixes_custom = tuple(punct)
		
		#prefixes_custom = tuple([r'\[\*\*', r'('])
		#suffixes_custom = tuple([r'\*\*\]', r')'])
		#infixes_custom = tuple([r'\[\*\*', r'\*\*\]', r'(', r')', r'>', r'<', r'->', r'-->', r'--->'])

		exceptions_custom = {id : pattern for id, pattern in tokenizer_utils.generate_matcher_pattern1()}		
		exceptions = update_exc(self.nlp.Defaults.tokenizer_exceptions, exceptions_custom)

		prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes + prefixes_custom)
		infix_re  = compile_infix_regex(infixes_custom + self.nlp.Defaults.infixes)
		suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes + suffixes_custom)
		
		tokenizer = SpacyTokenizer(self.nlp.vocab, rules=exceptions,
							prefix_search=prefix_re.search,
							suffix_search=suffix_re.search,
							infix_finditer=infix_re.finditer, token_match=self.nlp.Defaults.token_match)

		self.nlp.tokenizer = tokenizer

		matcher = Matcher(self.nlp.vocab)
						
		def on_match_pattern(matcher, doc, id, matches):
		
			match_id, start, end = matches[id]

			if self.nlp.vocab.strings[match_id].startswith('p3'):
				span = doc[start+1:end]
				span.merge()
				for i in range(id, len(matches)):
					matches[i] = (matches[i][0], matches[i][1] - 1,  matches[i][2] - 1)

			elif self.nlp.vocab.strings[match_id].startswith('p2.1'):
				span1 = doc[start:start+2]
				span2 = doc[start+2:end]
				span1.merge()
				span2.merge()
				for i in range(id, len(matches)):
					matches[i] = (matches[i][0], matches[i][1] - 2,  matches[i][2] - 2)

			elif self.nlp.vocab.strings[match_id].startswith('p2.2'):
				span2 = doc[start+1:end]
				span2.merge()
				for i in range(id, len(matches)):
					matches[i] = (matches[i][0], matches[i][1] - 1,  matches[i][2] - 1)

			elif self.nlp.vocab.strings[match_id].startswith('p2.3'):
				span1 = doc[start:start+2]
				span1.merge()
				for i in range(id, len(matches)):
					matches[i] = (matches[i][0], matches[i][1] - 1,  matches[i][2] - 1)
	
		for id, pattern in tokenizer_utils.generate_matcher_pattern2():
			matcher.add(id, on_match_pattern, pattern)
			
		for id, pattern in tokenizer_utils.generate_matcher_pattern3():
			matcher.add(id, on_match_pattern, pattern)
				
		self.nlp.add_pipe(matcher, before='parser')

		print('done')

		self.batch_size = batch_size
		self.n_cpus = n_cpus
		self.n_threads = n_threads
		self.mode = mode
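The constructor escapes every punctuation token, including the custom '[**' and '**]' markers, before handing them to spaCy's prefix/infix/suffix regex compilers. The escaping step itself can be illustrated with the stdlib re module alone (a sketch of that one step under those assumptions, not of the spaCy pipeline):

import re
import string

punct = list(string.punctuation)
punct.remove('.')
punct += ['[**', '**]']                      # custom de-identification markers
escaped = [re.escape(p) for p in punct]

# roughly what compile_prefix_regex does: anchor the alternation at the start,
# trying longer tokens first so '[**' wins over '['
prefix_re = re.compile('^(?:' + '|'.join(sorted(escaped, key=len, reverse=True)) + ')')
print(prefix_re.search('[**Hospital**] discharge').group())   # [**
print(prefix_re.search('(note) text').group())                # (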