def get_response_from_cache(self, service, raw_request, channel_item, channel_params, wsgi_environ, _loads=loads,
        _CachedResponse=_CachedResponse, _HashCtx=_HashCtx, _sha256=sha256, split_re=regex_compile('........?').findall):
        """ Returns a cached response for incoming request or None if there is nothing cached for it.
        By default, an incoming request's hash is calculated by sha256 over a concatenation of:
          * WSGI REQUEST_METHOD   # E.g. GET or POST
          * WSGI PATH_INFO        # E.g. /my/api
          * sorted(zato.http.GET) # E.g. ?foo=123&bar=456 (query string aka channel_params)
          * payload bytes         # E.g. '{"customer_id":"123"}' - a string object, before parsing
        Note that the query string is sorted, which means that ?foo=123&bar=456 is equal to ?bar=456&foo=123,
        i.e. the order of parameters in the query string does not matter.
        """
        if service.get_request_hash:
            hash_value = service.get_request_hash(_HashCtx(raw_request, channel_item, channel_params, wsgi_environ))
        else:
            query_string = str(sorted(channel_params.items()))
            data = '%s%s%s%s' % (wsgi_environ['REQUEST_METHOD'], wsgi_environ['PATH_INFO'], query_string, raw_request)
            hash_value = _sha256(data).hexdigest()
            hash_value = '-'.join(split_re(hash_value))

        # Regardless of whether the hash value is the default or comes from the service, always prefix it with the channel's type and ID
        cache_key = 'http-channel-%s-%s' % (channel_item['id'], hash_value)

        # We have the key so now we can check if there is any matching response already stored in cache
        response = self.server.get_from_cache(channel_item['cache_type'], channel_item['cache_name'], cache_key)

        # If there is a response, we can now load it into the format that our callers expect
        if response:
            response = _loads(response)
            response = _CachedResponse(response['payload'], response['content_type'], response['headers'],
                response['status_code'])

        return cache_key, response
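
For reference, a rough, hypothetical sketch (Python 3, standard library only) of how the default hash described in the docstring could be computed for a sample request; the request values and channel ID below are invented for illustration.

from hashlib import sha256
from re import compile as re_compile

_split = re_compile('........?').findall

# Invented request data mirroring the inputs used above.
wsgi_environ = {'REQUEST_METHOD': 'POST', 'PATH_INFO': '/my/api'}
channel_params = {'foo': '123', 'bar': '456'}
raw_request = '{"customer_id":"123"}'

query_string = str(sorted(channel_params.items()))  # sorting makes the key order-independent
data = '%s%s%s%s' % (
    wsgi_environ['REQUEST_METHOD'], wsgi_environ['PATH_INFO'], query_string, raw_request)
hash_value = sha256(data.encode('utf8')).hexdigest()  # .encode is needed on Python 3
hash_value = '-'.join(_split(hash_value))  # break the digest into dash-separated chunks
cache_key = 'http-channel-%s-%s' % (123, hash_value)  # 123 stands in for channel_item['id']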
Example #2
    def lists(self, pattern: str = None) -> List['WikiList']:
        """Return a list of WikiList objects.

        :param pattern: The starting pattern for list items.
            If pattern is None, return all types of lists (ol, ul, and dl).
            If pattern is not None, it will be passed to the regex engine,
            so remember to escape the `*` character. Examples:

                - `\#` means top-level ordered lists
                - `\#\*` means unordered lists inside an ordered one
                - Currently definition lists are not well supported, but you
                    can use `[:;]` as their pattern.

            Tips and tricks:

                Be careful when using the following patterns as they will
                probably cause malfunction in the `sublists` method of the
                resultant List. (However don't worry about them if you are
                not going to use the `sublists` method.)

                - Use `\*+` as a pattern and nested unordered lists will be
                    treated as flat.
                - Use `\*\s*` as a pattern to rstrip the `items` of the list.

                Although the pattern parameter is optional, specifying it
                can improve the performance.
        """
        lists = []
        lststr = self._lststr
        type_to_spans = self._type_to_spans
        spans = type_to_spans.setdefault('WikiList', [])
        spans_append = spans.append
        span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get
        patterns = (r'\#', r'\*', '[:;]') if pattern is None \
            else (pattern,)  # type: Tuple[str, ...]
        for pattern in patterns:
            list_regex = regex_compile(
                LIST_PATTERN_FORMAT.replace(b'{pattern}', pattern.encode()),
                MULTILINE,
            )
            ss = self._span[0]
            for m in list_regex.finditer(self._shadow):
                ms, me = m.span()
                span = [ss + ms, ss + me]
                old_span = span_tuple_to_span_get((span[0], span[1]))
                if old_span is None:
                    spans_append(span)
                else:
                    span = old_span
                lists.append(
                    WikiList(
                        lststr, pattern, m, type_to_spans, span, 'WikiList'
                    )
                )
        return lists
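
A short usage sketch, assuming this is the lists() method of wikitextparser's WikiText/SubWikiText objects (the wikitext sample is made up):

import wikitextparser as wtp

parsed = wtp.parse('# first\n# second\n#* nested a\n#* nested b\n')
for wiki_list in parsed.lists(pattern=r'\#'):    # top-level ordered lists only
    print(wiki_list.items)                       # the raw item texts
for wiki_list in parsed.lists(pattern=r'\#\*'):  # unordered items nested in an ordered list
    print(wiki_list.items)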
Example #3
class Piston(object):
    OPTION_PRIORITY_ORDER = (
        ("phone_number", PhoneNumberMixin.PHONE_NUMBER_QUERY),
        ("ip_address", MaxmindMixin.IP_ADDRESS_QUERY),
        ("latitude_longitude", NominatimMixin.REVERSE_GEOCODE_QUERY),
        ("longitude_latitude", NominatimMixin.REVERSE_GEOCODE_QUERY),
        ("latitude", NominatimMixin.REVERSE_GEOCODE_QUERY),
        ("global", NominatimMixin.TYPICAL_GEOCODE_QUERY),
        ("subglobal", NominatimMixin.TYPICAL_GEOCODE_QUERY),
        ("local", NominatimMixin.TYPICAL_GEOCODE_QUERY),
        ("sublocal", NominatimMixin.TYPICAL_GEOCODE_QUERY),
        ("postcode", NominatimMixin.TYPICAL_GEOCODE_QUERY),
    )

    LEGAL_CONFIGURATION_OPTIONS = tuple(
        option
        for option, _ in OPTION_PRIORITY_ORDER) + ("longitude", "unknown")

    @property
    def state(self):
        return (self.HITS.value, self.MISS.value, self.CONT.value,
                self.CODE.value, self.FUZZ.value, self.NULL.value,
                self.FAIL.value)

    @property
    def processed(self):
        return self.__processed.value

    @classmethod
    def spark(cls,
              directory='/',
              client=Ellipsis,
              configuration=None,
              nominatim_host=None,
              **kwargs):
        country_geocode = None
        region_geocode = None
        phone_geocode = None

        if configuration is None:
            if nominatim_host is None:
                raise ValueError("Cannot run without a configuration and a "
                                 "known Nominatim host address!")
            for dirname, _dirpath, filenames in walk(directory):
                if country_geocode is None and 'cgeo.json.xz' in filenames:
                    country_geocode = join(dirname, 'cgeo.json.xz')
                if region_geocode is None and 'rgeo.json.xz' in filenames:
                    region_geocode = join(dirname, 'rgeo.json.xz')
                if phone_geocode is None and 'pgeo.json.xz' in filenames:
                    phone_geocode = join(dirname, 'pgeo.json.xz')
                if country_geocode and region_geocode and phone_geocode:
                    break
        else:
            nominatim_host = nominatim_host or configuration.getNominatimURI()
            country_geocode = configuration.getNominatimCountryGeoJSON()
            region_geocode = configuration.getNominatimRegionGeoJSON()
            phone_geocode = configuration.getNominatimPhoneGeoJSON()

        nominatim_host = urlparse(nominatim_host)
        nominatim_host = "%s://%s/nominatim/" % (nominatim_host.scheme or
                                                 'http', nominatim_host.netloc
                                                 or nominatim_host.path)

        if country_geocode and region_geocode:
            configuration = {
                '_country_geocode': country_geocode,
                '_region_geocode': region_geocode,
                'verbose': ENV.get(ENV.VERBOSE, as_type=int) > 2
            }
            if phone_geocode:
                configuration['_phone_geocode'] = phone_geocode
            kwargs.update(configuration)
            return cls(client, nominatim_host, **kwargs)
        raise ValueError("Cannot initialize geo.engine.Piston without "
                         "country and region geocode mappings.")

    @classmethod
    def generate_field_mapping(cls, config):
        mapping = {}

        try:
            gindex = config.params.geo_index
        except:
            gindex = {}

        for index_type in cls.LEGAL_CONFIGURATION_OPTIONS:
            for index_field in gindex.get(index_type, ()):
                mapping[index_field] = index_type

        return mapping

    NS = regex_compile(
        r'[\p{script=Han}\p{script=Tibetan}\p{script=Lao}'
        r'\p{script=Thai}\p{script=Khmer}]', regex_U)
    NS = frozenset(NS.findall(u''.join(unichr(i) for i in xrange(maxunicode))))
    # Hardcoded geo-string replacement values.
    HC = {}
    HC.update({  # Chinese main provinces
        province: province + u'\u7701'
        for province in {
            u'\u6cb3\u5317',
            u'\u5c71\u897f',
            u'\u8fbd\u5b81',
            u'\u5409\u6797',
            u'\u9ed1\u9f99\u6c5f',
            u'\u6c5f\u82cf',
            u'\u6d59\u6c5f',
            u'\u5b89\u5fbd',
            u'\u798f\u5efa',
            u'\u6c5f\u897f',
            u'\u6cb3\u5357',
            u'\u5c71\u4e1c',
            u'\u6e56\u5317',
            u'\u6e56\u5357',
            u'\u5e7f\u4e1c',
            u'\u6d77\u5357',
            u'\u56db\u5ddd',
            u'\u8d35\u5dde',
            u'\u4e91\u5357',
            u'\u7518\u8083',
            u'\u9752\u6d77',
            u'\u53f0\u6e7e',
        }
    })
    HC[u'\u9655\u897f'] = HC[u'\u9655\u897f\u7701'] = "Shaanxi"  # Shaanxi vs Shanxi must be disambiguated by romanization
    HC[u'\u5c71\u897f'] = HC[u'\u5c71\u897f\u7701'] = "Shanxi"  # overrides the generic province mapping above
    HC.update({  # Chinese major cities
        city: city + u'\u5e02'
        for city in {
            u'\u5317\u4eac',
            u'\u5929\u6d25',
            u'\u4e0a\u6d77',
            u'\u91cd\u5e86',
        }
    })
    HC.update({  # Chinese autonomous regions
        auto: auto + u'\u81ea\u6cbb\u533a'
        for auto in {
            u'\u5167\u8499\u53e4',
            u'\u5e7f\u897f\u58ee',
            u'\u897f\u85cf',
            u'\u5b81\u590f\u56de\u65cf',
            u'\u65b0\u7586\u7ef4\u543e\u5c14',
        }
    })
    HC.update({  # Chinese "Special" administrative regions
        spec: spec + u'\u7279\u522b\u884c\u653f\u533a'
        for spec in {
            # u'\u9999\u6e2f', # This is Hong Kong
            u'\u6fb3\u95e8',
        }
    })

    def remap_documents(self, document, mapping):
        information = {field_type: [] for field_type in mapping.values()}
        for field_name, field_type in mapping.iteritems():
            information[field_type].append(document.get(field_name))
        return self.remap_information(information)

    def remap_information(self, information):
        for field_type in information.iterkeys():
            information[field_type] = normalize(
                'NFKC', u' '.join(
                    filter(None,
                           (p.strip() if isinstance(p, basestring) else u''
                            for p in information[field_type]))))
            if self.NS.intersection(information[field_type]):
                information[field_type] = self.tokenizer(
                    'zh', information[field_type])
            information[field_type] = self.HC.get(information[field_type],
                                                  information[field_type])

        for field_type, search_type in self.OPTION_PRIORITY_ORDER:
            geo_lookup = getattr(self, '_' + field_type)(information)
            if isinstance(geo_lookup, dict):
                return search_type, geo_lookup

        return NominatimMixin.TYPICAL_GEOCODE_QUERY, self._unknown(information)

    LL = regex_compile(r'[^\+\-\.0-9]+')

    def _latitude_longitude(self, information):
        try:
            information['latitude'], information['longitude'] = map(
                float,
                filter(None,
                       self.LL.split(information['latitude_longitude']))[:2])
            return self._latitude(information)
        except:
            return None

    def _longitude_latitude(self, information):
        try:
            information['longitude'], information['latitude'] = map(
                float,
                filter(None,
                       self.LL.split(information['longitude_latitude']))[:2])
            return self._latitude(information)
        except:
            return None

    def _latitude(self, information):
        try:
            lat = '%010.5f' % float(information['latitude'])
            lon = '%010.5f' % float(information['longitude'])
            return {'lat': lat, 'lon': lon, 'orig': '%s, %s' % (lat, lon)}
        except:
            return None

    IA = regex_compile(r'[^0-9a-fA-F\:\.]', regex_U)

    def _ip_address(self, information):
        try:
            addresses = filter(
                None,
                sum([
                    self.IA.split(ia)
                    for ia in information['ip_address'].split()
                ], []))
            return {'ip_address': addresses, 'orig': ' / '.join(addresses)}
        except:
            return None

    def _phone_number(self, information):
        try:
            return {'phone_number': information['phone_number'].split()}
        except:
            return None

    def __clean_carry_and_extra(self, carry=None, extra=None):
        if isinstance(carry, Mapping):
            carry = dict(carry)
        else:
            carry = {}

        if isinstance(extra, basestring):
            extra = (extra, )
        elif isinstance(extra, Container):
            extra = tuple(extra)
        else:
            extra = tuple()

        return carry, extra

    CN = regex_compile(r'([\p{L}\p{N}]\P{Z}*[\p{L}\p{N}],?)', regex_U)

    def _global(self, information, carry=None, extra=None):
        try:
            carry, extra = self.__clean_carry_and_extra(carry, extra)
            global_ = u' '.join(match.group() for match in self.CN.finditer(
                information.get('global', u''))).strip()
            if global_:
                carry['country'] = global_
                extra = (global_, ) + extra
            return self._subglobal(information, carry, extra)
        except:
            return None

    def _subglobal(self, information, carry=None, extra=None):
        try:
            carry, extra = self.__clean_carry_and_extra(carry, extra)
            subglobal_ = u' '.join(match.group() for match in self.CN.finditer(
                information.get('subglobal', u''))).strip()
            if subglobal_:
                carry['state'] = subglobal_
                extra = (subglobal_, ) + extra
            return self._local(information, carry, extra)
        except:
            return None

    def _local(self, information, carry=None, extra=None):
        try:
            carry, extra = self.__clean_carry_and_extra(carry, extra)
            local_ = u' '.join(match.group() for match in self.CN.finditer(
                information.get('local', u''))).strip()
            if local_:
                carry['city'] = local_
                extra = (local_, ) + extra
            return self._sublocal(information, carry, extra)
        except:
            return None

    def _sublocal(self, information, carry=None, extra=None):
        try:
            carry, extra = self.__clean_carry_and_extra(carry, extra)
            sublocal_ = u' '.join(match.group() for match in self.CN.finditer(
                information.get('sublocal', u''))).strip()
            if sublocal_:
                carry['street'] = sublocal_
                extra = (sublocal_, ) + extra
            return self._postcode(information, carry, extra)
        except:
            return None

    def _postcode(self, information, carry=None, extra=None):
        try:
            carry, extra = self.__clean_carry_and_extra(carry, extra)
            postcode = u' '.join(match.group() for match in self.CN.finditer(
                information.get('postcode', u''))).strip()
            if postcode:
                carry['postalcode'] = postcode
            return self._unknown(information, carry, extra)
        except:
            return None

    def _unknown(self, information, carry=None, extra=None):
        try:
            carry, extra = self.__clean_carry_and_extra(carry, extra)
            unknown = u' '.join(match.group() for match in self.CN.finditer(
                information.get('unknown', u''))).strip()
            if unknown:
                extra = (unknown, ) + extra
            if extra:
                carry['q'] = u', '.join(
                    OrderedDict.fromkeys(filter(None, extra)))
                carry['orig'] = carry['q']
            if any(carry.values()):
                return carry
            else:
                return None
        except:
            return None

    @staticmethod
    def spawn_session(namespace, concurrency=4):
        session = Session()
        session.mount(prefix=namespace,
                      adapter=HTTPAdapter(pool_connections=concurrency,
                                          pool_maxsize=concurrency * 2,
                                          max_retries=0,
                                          pool_block=False))
        return session

    def __init__(self, client, nominatim_host, **kwargs):
        if isinstance(client, MongoClient):
            self.__client = client
        elif client is Ellipsis:
            warn("Ignoring invalid client -- cannot run jobs!", RuntimeWarning)
        else:
            raise TypeError("Nominatim must be started with a MongoClient!")

        self.__ns = kwargs['nominatim_host'] = nominatim_host
        warn("Geocoding against %s." % self.__ns, UserWarning)
        self.__map = CentroidUpdateHelper(**kwargs)
        self.__used = set()
        self.__cache = CacheDictionary(maxsize=kwargs.get('maxsize', 100000),
                                       weakref=False)
        self.__thread = kwargs.get('concurrent', 4)
        self.__session = self.spawn_session(namespace=self.__ns,
                                            concurrency=self.__thread)
        self.__processed = Value('i', 0, lock=False)
        self.__sleep = Value('f', 0.0, lock=True)
        self.tokenizer = LanguageTokenizer(concurrent=True)
        self.concurrent = kwargs.get('concurrent', 4)

    def session_fetch_function(self, url, **kwargs):
        return self.__session.get(url, timeout=5.0, **kwargs).content

    def restore_from_cache(self, subdomain, limit=None):
        if isinstance(limit, int):
            limit = int(
                max(min(limit, self.__cache.maxsize),
                    CacheDictionary.CACHE_SIZE_MIN // 100))
        else:
            limit = self.__cache.maxsize

        self.__cache = CacheDictionary(maxsize=self.__cache.maxsize,
                                       weakref=False)
        for doc in self.__client[subdomain][MC.CACHE_COL].find().sort(
                RO.LAST, DESCENDING).limit(limit):
            if 'value' in doc and doc['value']:
                search_query = doc.pop(RO.OBJECT_ID)
                self.__cache[CacheDictionary.gen_cache_key(
                    search_query)] = doc['value']

    def update_mongo_cache(self, subdomain):
        action = time() * 1000.0
        bulk = self.__client[subdomain][
            MC.CACHE_COL].initialize_unordered_bulk_op()
        for key in self.__used:
            if not key:
                continue
            _id = CacheDictionary.restore_cache_key(key)
            val = self.__cache.quiet_get(key)
            if val:
                bulk.find({
                    RO.OBJECT_ID: _id
                }).upsert().update({'$set': {
                    'value': val,
                    RO.LAST: action
                }})
        bulk.execute()

    def process(self, config, subdomain=None, pool_size=4, verbose=False):
        for _ in self.iterprocess(config, subdomain, pool_size, verbose):
            pass

    def _report_status_oneline(self, locked, runtime):
        stdout.write(
            "[% 9.3f] %d hits / %d misses / %d calls <--> "
            "%d coded (nom) / %d coded (idf) / %d empty <--> "
            "%04.2f sleep / %03.2f codes / %d total ~~ %d in iterlock.\r" %
            (time() - runtime, self.HITS.value, self.MISS.value,
             self.CONT.value, self.CODE.value, self.FUZZ.value,
             self.NULL.value, self.__sleep.value,
             float(self.HITS.value + self.MISS.value) / (time() - runtime),
             self.HITS.value + self.MISS.value + self.FAIL.value, locked))
        stdout.flush()

    def _report_status_compact(self, locked, runtime):
        stdout.write(
            "[% 9.3f] %d in iterlock (%d processed, %.3f per second)\n"
            "     Cache: %d hits / %d misses\n"
            "    Result: %d results -> %d codified\n"
            "   Network: %d calls (sleeping for %04.2f seconds)\n"
            " Timestamp: %s\n" % (
                time() - runtime,
                locked,
                self.HITS.value + self.MISS.value + self.FAIL.value,
                float(self.HITS.value + self.MISS.value) / (time() - runtime),
                self.HITS.value,
                self.MISS.value,
                self.CODE.value,
                self.FUZZ.value,
                self.CONT.value,
                self.__sleep.value,
                ctime(),
            ))
        stdout.flush()

    def report_status(self, locked, runtime):
        return self._report_status_compact(locked, runtime)

    def iterprocess(self, config, subdomain=None, pool_size=4, verbose=False):
        if verbose:
            runtime = time()
        self.__processed = Value('i', 0, lock=True)
        self.HITS = Value('i', 0, lock=True)
        self.MISS = Value('i', 0, lock=True)
        self.CONT = Value('i', 0, lock=True)
        self.CODE = Value('i', 0, lock=True)
        self.FUZZ = Value('i', 0, lock=True)
        self.NULL = Value('i', 0, lock=True)
        self.FAIL = Value('i', 0, lock=True)

        subdomain = config['mongo_db'] if subdomain is None else subdomain
        if subdomain is Ellipsis:
            pass
        elif subdomain:
            if verbose:
                stdout.write(
                    "[% 9.3f] Beginning retrieval of mongo cache...\n" %
                    (time() - runtime))
            self.restore_from_cache(subdomain=subdomain,
                                    limit=config.meta.counts.total)
            if verbose:
                stdout.write("[% 9.3f] Retrieval of mongo cache complete.\n" %
                             (time() - runtime))

        self._config = self.generate_field_mapping(config)
        pool = ThreadPool(min(max(int(pool_size), 1), self.concurrent))

        bulk = self.__client[config.mongo_db][
            config.mongo_table].initialize_unordered_bulk_op()
        locked = LockedIterator(
            self.__client[config.mongo_db][config.mongo_table].find(
                {}, projection={field: 1
                                for field in self._config}),
            lock_past=self.concurrent * 2150)

        if verbose:
            self.report_status(len(locked), runtime)
            last = time()

        for _id, geo, err in pool.imap_unordered(self.__process, locked):
            # for _id, geo, err in imap(self.__process, locked):
            locked -= 1
            self.__processed.value += 1
            if verbose and (time() - last) > 5.0:
                self.report_status(len(locked), runtime)
                last = time()
            if not _id or err is not None:
                stdout.write('\n%s\n' % err)
                yield err
                continue
            bulk.find({RO.OBJECT_ID: _id}).update_one({'$set': {DF.geo: geo}})
            yield None

        if verbose:
            self.report_status(len(locked), runtime)
            stdout.write("\n[% 9.3f] Geocoding complete.\n" %
                         (time() - runtime))

        pool.close()
        pool.join()

        if verbose:
            stdout.flush()
            stdout.write("[% 9.3f] Subthreads joined.\n" % (time() - runtime))

        bulk.execute()
        if verbose:
            stdout.write("[% 9.3f] Bulk insertion of results complete.\n" %
                         (time() - runtime))

        if subdomain is Ellipsis:
            pass
        elif subdomain:
            self.update_mongo_cache(subdomain=subdomain)
            if verbose:
                stdout.write(
                    "[% 9.3f] Bulk update of mongo cache complete.\n" %
                    (time() - runtime))
        yield None

    def iterprocess_streaming(self,
                              docs,
                              model,
                              subdomain=None,
                              pool_size=4,
                              verbose=False):
        if verbose:
            runtime = time()
        self.__processed = Value('i', 0, lock=True)
        self.HITS = Value('i', 0, lock=True)
        self.MISS = Value('i', 0, lock=True)
        self.CONT = Value('i', 0, lock=True)
        self.CODE = Value('i', 0, lock=True)
        self.FUZZ = Value('i', 0, lock=True)
        self.NULL = Value('i', 0, lock=True)
        self.FAIL = Value('i', 0, lock=True)

        if subdomain:
            if verbose:
                stdout.write(
                    "[% 9.3f] Beginning retrieval of mongo cache...\n" %
                    (time() - runtime))
            self.restore_from_cache(subdomain=subdomain, limit=10000)
            if verbose:
                stdout.write("[% 9.3f] Retrieval of mongo cache complete.\n" %
                             (time() - runtime))

        self._config = self.generate_field_mapping(model)
        pool = ThreadPool(min(max(int(pool_size), 1), self.concurrent))

        #bulk = self.__client[config.mongo_db][
        #    config.mongo_table].initialize_unordered_bulk_op()

        locked = LockedIterator(docs, lock_past=self.concurrent * 2150)

        if verbose:
            self.report_status(len(locked), runtime)
            last = time()

        for _id, geo, err in pool.imap_unordered(self.__process, locked):
            # for _id, geo, err in imap(self.__process, locked):
            locked -= 1
            self.__processed.value += 1
            if verbose and (time() - last) > 5.0:
                self.report_status(len(locked), runtime)
                last = time()
            if not _id or err is not None:
                stdout.write('\n%s\n' % err)
                #yield err
                continue
            yield (_id, geo)

        if verbose:
            self.report_status(len(locked), runtime)
            stdout.write("\n[% 9.3f] Geocoding complete.\n" %
                         (time() - runtime))

        pool.close()
        pool.join()

        if verbose:
            stdout.flush()
            stdout.write("[% 9.3f] Subthreads joined.\n" % (time() - runtime))

    def fire(self, query, type_=None):
        if type_ is None:
            type_, query = self.remap_information(query)
        return Stroke(host=self.__ns,
                      query=query,
                      assoc=self.__map,
                      cache=self.__cache,
                      search_type=type_,
                      verbose=True,
                      debug=False,
                      _fetch_function=self.session_fetch_function,
                      _catch_exceptions=(ConnectionError, Timeout),
                      _sleep=self.__sleep)

    def __process(self, dictionary):
        try:
            _id = dictionary[RO.OBJECT_ID]

            type_, query = self.remap_documents(dictionary, self._config)
            if isinstance(query, dict):
                geocode = self.fire(query, type_)

                if geocode.call_was_cached:
                    with self.HITS.get_lock():
                        self.HITS.value += 1
                    self.__used.add(geocode.cache_key)
                else:
                    with self.MISS.get_lock():
                        self.MISS.value += 1
                with self.CONT.get_lock():
                    self.CONT.value += geocode.calls

                if geocode.result:
                    result = geocode.result[0]
                else:
                    result = {}

                if result.get('id') > 0 or result.get('full'):
                    with self.CODE.get_lock():
                        self.CODE.value += 1

                if result.get('code', {}).get('country'):
                    with self.FUZZ.get_lock():
                        self.FUZZ.value += 1
                else:
                    with self.NULL.get_lock():
                        self.NULL.value += 1

                if query.get('orig'):
                    result['orig'] = query['orig']
            else:
                with self.FAIL.get_lock():
                    self.FAIL.value += 1
                result = {}

            return _id, result, None
        except:
            with self.FAIL.get_lock():
                self.FAIL.value += 1
            return None, {}, format_exc()
Example #4
from datetime import date
from urllib.parse import urlparse

from regex import compile as regex_compile
from requests import ConnectionError as RequestsConnectionError

from lib.commons import dict_to_sfn_cit_ref
from lib.urls import (
    urls_sfn_cit_ref, url2dict, get_home_title, get_html, find_authors,
    find_journal, find_site_name, find_title, ContentTypeError,
    ContentLengthError, StatusCodeError, TITLE_TAG
)


URL_FULLMATCH = regex_compile(
    r'https?+://web(?:-beta)?+\.archive\.org/(?:web/)?+'
    r'(\d{4})(\d{2})(\d{2})\d{6}(?>cs_|i(?>d_|m_)|js_)?+/(http.*)'
).fullmatch
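
A quick, illustrative call showing what URL_FULLMATCH extracts from a Wayback Machine URL (the URL below is made up):

m = URL_FULLMATCH(
    'https://web.archive.org/web/20190101123456/http://example.com/page')
if m is not None:
    archive_year, archive_month, archive_day, original_url = m.groups()
    # ('2019', '01', '01', 'http://example.com/page')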


def waybackmachine_sfn_cit_ref(
    archive_url: str, date_format: str = '%Y-%m-%d'
) -> tuple:
    """Create the response namedtuple."""
    m = URL_FULLMATCH(archive_url)
    if not m:
        # Could not parse the archive_url. Treat as an ordinary URL.
        return urls_sfn_cit_ref(archive_url, date_format)
    archive_year, archive_month, archive_day, original_url = \
        m.groups()
    original_dict = {}
    thread = Thread(
"""Define the ExternalLink class."""

from typing import Optional

from regex import compile as regex_compile

from ._spans import VALID_EXTLINK_CHARS
from ._wikitext import SubWikiText

URL_MATCH = regex_compile(VALID_EXTLINK_CHARS).match


class ExternalLink(SubWikiText):
    """Create a new ExternalLink object."""
    @property
    def url(self) -> str:
        """Return the url."""
        if self[0] == '[':
            return self[1:URL_MATCH(self._ext_link_shadow, 1).end()]
        return self.string

    @url.setter
    def url(self, newurl: str) -> None:
        """Set a new url."""
        if self[0] == '[':
            self[1:len('[' + self.url)] = newurl
        else:
            self[0:len(self.url)] = newurl

    @property
    def text(self) -> Optional[str]:
Example #6
"""Define the Argument class."""
from typing import Dict, List, MutableSequence, Optional, Union

from regex import compile as regex_compile, MULTILINE, DOTALL

from ._wikitext import SubWikiText, SECTION_HEADING

ARG_SHADOW_FULLMATCH = regex_compile(
    rb'[|:](?<pre_eq>(?:[^=]*+(?:' + SECTION_HEADING +
    rb'\n)?+)*+)(?:\Z|(?<eq>=)(?<post_eq>.*+))', MULTILINE | DOTALL).fullmatch


class Argument(SubWikiText):
    """Create a new Argument Object.

    Note that in MediaWiki documentation `arguments` are (also) called
    parameters. In this module the convention is:
    {{{parameter}}}, {{template|argument}}.
    See https://www.mediawiki.org/wiki/Help:Templates for more information.
    """

    __slots__ = '_shadow_match_cache', '_parent'

    def __init__(
        self,
        string: Union[str, MutableSequence[str]],
        _type_to_spans: Optional[Dict[str, List[List[int]]]] = None,
        _span: Optional[List[int]] = None,
        _type: Optional[Union[str, int]] = None,
        _parent: 'SubWikiTextWithArgs' = None,
    ):
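
A brief usage sketch of the parameter/argument convention described in the docstring, assuming the public wikitextparser API (the template text is made up):

import wikitextparser as wtp

template = wtp.parse('{{template|positional|key=value}}').templates[0]
for arg in template.arguments:        # Argument objects
    print(arg.name, '=>', arg.value)  # roughly: '1' => 'positional', 'key' => 'value'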
Example #7
"""All things that are specifically related to adinebook website"""

from collections import defaultdict
from logging import getLogger
from typing import Optional

from langid import classify
from regex import compile as regex_compile
from requests import RequestException
from mechanicalsoup import StatefulBrowser

from lib.commons import first_last, dict_to_sfn_cit_ref, request, USER_AGENT,\
    LANG

ISBN_SEARCH = regex_compile(r'ISBN: </b> ([-\d]++)').search
DATE_SEARCH = regex_compile(
    r'تاریخ نشر:</b>(?<year>\d\d)/(?<month>\d\d)/(?<day>\d\d)').search
PUBLISHER_SEARCH = regex_compile(
    r'Publisher_ctl00_NameLabel" class="linkk">(.*?)</span>').search
VOLUME_SEARCH = regex_compile(r'\bجلد (\d+)').search
TITLE_SEARCH = regex_compile(r'BookTitle" class="h4">([^<]++)').search
AUTHORS_FINDALL = regex_compile(
    r'rptAuthor_ctl\d\d_NameLabel" class="linkk">([^>:]++):([^<]++)<').findall
LOCATION_SEARCH = regex_compile(r'محل نشر:</b>([^<]++)<').search


def ketabir_scr(url: str, date_format='%Y-%m-%d') -> tuple:
    """Return the response namedtuple."""
    dictionary = url2dictionary(url)
    dictionary['date_format'] = date_format
    if 'language' not in dictionary:
Example #8
from typing import Optional

from langid import classify
from regex import compile as regex_compile, DOTALL
from isbnlib import info as isbn_info

from config import LANG
from lib.ketabir import url2dictionary as ketabir_url2dictionary
from lib.ketabir import isbn2url as ketabir_isbn2url
from lib.bibtex import parse as bibtex_parse
from lib.commons import dict_to_sfn_cit_ref, request, ISBN13_SEARCH, \
    ISBN10_SEARCH
from lib.ris import ris_parse

OTTOBIB_SEARCH = regex_compile(
    '<textarea[^>]*+>(.*?)</textarea>',
    DOTALL,
).search

RM_DASH_SPACE = str.maketrans('', '', '- ')


class IsbnError(Exception):
    """Raise when bibliographic information is not available."""

    pass


def isbn_scr(isbn_container_str: str,
             pure: bool = False,
             date_format: str = '%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
Example #9
# TAG_CONTENTS = r'(?<contents>(?>(?!{TAG}).)*?)'.format(
#     TAG=TAG.format(**locals())
# )
# TAG_FINDITER = regex_compile(
#     TAG.format(**locals()), flags=DOTALL | VERBOSE
# ).finditer
# Note that the following regex won't check for nested tags
TAG_FULLMATCH = regex_compile(
    rb'''
    # Note that the start group does not include the > character
    <''' + ASCII_TAG_NAME + ATTR_PATTERN + rb'''*  # Todo: Possessive?
    # After the attributes, or after the tag name if there are no attributes,
    # there may be one or more space characters. This is sometimes required but
    # ignored here.
    (?<attr_insert>)
    [''' + SPACE_CHARS + rb''']*+
    (?>
        (?<self_closing>/\s*>)
        |>(?<contents>.*?)'''
    + END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>(?P=name))')
    + rb'''|>  # only start; no end tag
    )''',
    DOTALL | VERBOSE,
).fullmatch


class SubWikiTextWithAttrs(SubWikiText):

    """Define a class for SubWikiText objects that have attributes.

    Any class that is going to inherit from SubWikiTextWithAttrs should provide
"""Define the Comment class."""
from typing import Dict, List, MutableSequence, Optional, Union

from regex import MULTILINE, compile as regex_compile

from ._wikitext import SubWikiText
from ._spans import COMMENT_PATTERN

COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ")*+"
COMMENT_COMMA = "(?>" + COMMENT_PATTERN + ")*+'"
BOLD_FULLMATCH = regex_compile(
    COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + "|$)",
    MULTILINE).fullmatch
ITALIC_FULLMATCH = regex_compile(COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA +
                                 "|$)").fullmatch
ITALIC_NOEND_FULLMATCH = regex_compile(COMMA_COMMENT + "'(.*)").fullmatch


class Comment(SubWikiText):
    __slots__ = ()

    @property
    def contents(self) -> str:
        """Return contents of this comment."""
        return self(4, -3)

    @property
    def comments(self) -> List['Comment']:
        return []

Example #11
# jB_TO_NUM contains entries for both ی and ي
jB_TO_NUM = {
    'فروردین': 1,
    'اردیبهشت': 2,
    'خرداد': 3,
    'تیر': 4,
    'مرداد': 5,
    'شهریور': 6,
    'مهر': 7,
    'آبان': 8,
    'آذر': 9,
    'دی': 10,
    'بهمن': 11,
    'اسفند': 12}

DOUBLE_DIGIT_SEARCH = regex_compile(r'\d\d').search

# Date patterns:

# January|February...
B = (
    r'''
    (?<B>(?:J(?:anuary|u(?:ne|ly))
    |
    February
    |
    Ma(?:rch|y)
    |
    A(?:pril|ugust)
    |
    (?:(?:(?:Sept|Nov|Dec)em)|Octo)ber))
Example #12
RIS_FULLMATCH = regex_compile(
    r'''
    (?: # this group matches any line
        ^
        (?>
            A[U\d]\ {2}-\ (?<author>.++)
            |DA\ {2}-\ \d++/(?<month>\d++).*+
            |EP\ {2}-\ (?<end_page>.++)
            |IS\ {2}-\ (?<issue>.++)
            |J[FA]\ {2}-\ (?<journal>.++)
            |LA\ {2}-\ (?<language>.++)
            |P(?>
                B\ {2}-\ (?<publisher>.++)
                |Y\ {2}-\ (?<year>\d++).*+
            )
            |S(?>
                N\ {2}-\ (?<isbn>\S*+).*+
                |P\ {2}-\ (?<start_page>.++)
            )
            |T(?>
                [1I]\ {2}-\ (?<title>.++)
                |3\ {2}-\ (?<series>.++)
                |Y\ {2}-\ (?<type>.++)
            )
            |UR\ {2}-\ (?<url>.++)
            |VL\ {2}-\ (?<volume>.++)
            |Y1\ {2}-\ (?<year>\d++).*+
            # any other line
            |[^\n]*+
        )
        \n
    )*
    ''',
    VERBOSE | MULTILINE,
).fullmatch
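
A hedged sketch of applying this fullmatch to a small, fabricated RIS record; with the third-party regex module, the repeated named groups can be read back individually or via captures():

ris_text = (
    'TY  - JOUR\n'
    'AU  - Doe, Jane\n'
    'T1  - An Example Title\n'
    'JF  - Journal of Examples\n'
    'PY  - 2020\n'
    'ER  - \n'
)
m = RIS_FULLMATCH(ris_text)
if m is not None:
    print(m['title'])            # 'An Example Title'
    print(m['journal'])          # 'Journal of Examples'
    print(m.captures('author'))  # ['Doe, Jane']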
Example #13
"""Codes required to create English Wikipedia citation templates."""


from datetime import date as datetime_date
from functools import partial
from collections import defaultdict
from logging import getLogger

from regex import compile as regex_compile

from lib.language import TO_TWO_LETTER_CODE


# Includes ShortDOIs (See: http://shortdoi.org/) and
# https://www.crossref.org/display-guidelines/
DOI_URL_MATCH = regex_compile(r'https?://(dx\.)?doi\.org/').match

refless = partial(regex_compile(
    r'( \| ref=({{.*?}}|harv))(?P<repl> \| |}})'
).sub, r'\g<repl>')
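
For illustration, refless strips a ` | ref=harv` or ` | ref={{...}}` parameter from a citation template; the template strings below are made up:

print(refless('{{cite book | title=T | ref=harv | page=3}}'))
# '{{cite book | title=T | page=3}}'
print(refless('{{cite journal | title=T | ref={{sfnref|Doe|2020}}}}'))
# '{{cite journal | title=T}}'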

TYPE_TO_CITE = {
    # BibTex types. Descriptions are from
    # http://ctan.um.ac.ir/biblio/bibtex/base/btxdoc.pdf
    # A part of a book, which may be a chapter (or section or whatever) and/or
    # a range of pages.
    'inbook': 'book',
    # A work that is printed and bound, but without a named publisher or
    # sponsoring institution.
    # Note: Yadkard does not currently support the `howpublished` option.
    'booklet': 'book',
Example #14
from langid import classify
from regex import compile as regex_compile, DOTALL

from config import LANG
from lib.ketabir import url2dictionary as ketabir_url2dictionary
from lib.ketabir import isbn2url as ketabir_isbn2url
from lib.bibtex import parse as bibtex_parse
from lib.commons import dict_to_sfn_cit_ref, request  # , Name
from lib.ris import parse as ris_parse


# original regex from:
# https://www.debuggex.com/r/0Npla56ipD5aeTr9
# https://www.debuggex.com/r/2s3Wld3CVCR1wKoZ
ISBN_10OR13_SEARCH = regex_compile(
    r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*){9}\d'
    r'|(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]'
).search

ISBN10_SEARCH = regex_compile(
    r'(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]'
).search

ISBN13_SEARCH = regex_compile(
    r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}\d'
).search
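
A quick, non-authoritative check of these search functions on made-up ISBN strings (the regexes only match the format; they do not validate checksums):

print(ISBN13_SEARCH('ISBN 978-1-4028-9462-6')[0])  # '978-1-4028-9462-6'
print(ISBN10_SEARCH('ISBN 1-4028-9462-7')[0])      # '1-4028-9462-7'
print(ISBN_10OR13_SEARCH('no isbn here'))          # None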


# original regex from: http://stackoverflow.com/a/14260708/2705757
# ISBN_REGEX = regex_compile(
#     r'(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)'
#     r'?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]'
Example #15
from regex import compile as regex_compile, VERBOSE, IGNORECASE
from requests import Response as RequestsResponse
from requests.exceptions import RequestException

from lib.commons import (
    find_any_date, dict_to_sfn_cit_ref, ANYDATE_PATTERN,
    request)
from lib.urls_authors import find_authors, CONTENT_ATTR


MAX_RESPONSE_LENGTH = 2000000

# https://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html
CHARSET = regex_compile(
    rb'''
    <meta(?!\s*+(?>name|value)\s*+=)[^>]*?charset\s*+=[\s"']*+([^\s"'/>]*)
    ''',
    IGNORECASE | VERBOSE,
).search
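
A tiny illustration of the charset search on a made-up HTML fragment (note that both the pattern and the input are bytes):

html = b'<head><meta http-equiv="Content-Type" charset="windows-1251"></head>'
m = CHARSET(html)
if m is not None:
    print(m[1])  # b'windows-1251'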

TITLE_META_NAME_OR_PROP = r'''
    (?>name|property)=(?<q>["\'])
        (?>citation_title|title|Headline|og:title)
    (?P=q)
'''
TITLE_SEARCH = regex_compile(
    r'<meta\s++(?:'
    + TITLE_META_NAME_OR_PROP + r'\s++' + CONTENT_ATTR
    + '|'
    + CONTENT_ATTR + r'\s++' + TITLE_META_NAME_OR_PROP
    + ')'
    '|'
Example #16
CAPTION_MATCH = regex_compile(
    r"""
    # Everything until the caption line
    (?P<preattrs>
        # Start of table
        {\|
        (?:
            (?:
                (?!\n\s*+\|)
                [\s\S]
            )*?
        )
        # Start of caption line
        \n\s*+\|\+
    )
    # Optional caption attrs
    (?:
        (?P<attrs>[^\n|]*+)
        (?:\|)
        (?!\|)
    )?
    (?P<caption>.*?)
    # End of caption line
    (?:
        \n|
        \|\|
    )
    """,
    VERBOSE
).match
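
A short sketch of CAPTION_MATCH applied to a made-up wikitext table:

table = '{| class="wikitable"\n|+ Population by year\n|-\n| 2020 || 83\n|}'
m = CAPTION_MATCH(table)
if m is not None:
    print(m['caption'])  # ' Population by year' (leading whitespace preserved)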
T = TypeVar('T')
Example #17
"""Define the Section class."""

from regex import compile as regex_compile

from ._wikitext import SubWikiText

HEADER_MATCH = regex_compile(rb'(={1,6})([^\n]+?)\1[ \t]*(\n|\Z)').match
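
A small demonstration of what the header regex captures; the shadow bytes below are invented:

m = HEADER_MATCH(b'== History ==\nSection body ...')
if m is not None:
    print(len(m[1]))  # 2, i.e. a level-2 heading
    print(m[2])       # b' History '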


class Section(SubWikiText):
    """Section class is used to represent page sections."""

    _header_match_cache = (None, None)

    @property
    def _header_match(self):
        cached_match, cached_shadow = self._header_match_cache
        shadow = self._shadow
        if cached_shadow == shadow:
            return cached_match
        m = HEADER_MATCH(shadow)
        self._header_match_cache = m, shadow
        return m

    @property
    def level(self) -> int:
        """The level of this section.

        getter: Return the level as an int in range(1, 7), or 0 for the lead
            section.
        setter: Change the level.
Example #18
def test_end_tag_patterns():
    assert regex_compile(END_TAG_PATTERN.replace(
        b'{name}', b'p')).search(b'</p>').groupdict() == {
            'end_tag': b'</p>'
        }
Example #19
RIS_FULLMATCH = regex_compile(
    r'''
    (?: # this group matches any line
        ^
        (?>
            A[U\d]\ {2}-\ (?<author>.++)
            |DA\ {2}-\ \d++/(?<month>\d++).*+
            |EP\ {2}-\ (?<end_page>.++)
            |IS\ {2}-\ (?<issue>.++)
            |J[FA]\ {2}-\ (?<journal>.++)
            |LA\ {2}-\ (?<language>.++)
            |P(?>
                B\ {2}-\ (?<publisher>.++)
                |Y\ {2}-\ (?<year>\d++).*+
            )
            |S(?>
                N\ {2}-\ (?<isbn>.++)
                |P\ {2}-\ (?<start_page>.++)
            )
            |T(?>
                [1I]\ {2}-\ (?<title>.++)
                |3\ {2}-\ (?<series>.++)
                |Y\ {2}-\ (?<type>.++)
            )
            |UR\ {2}-\ (?<url>.++)
            |VL\ {2}-\ (?<volume>.++)
            |Y1\ {2}-\ (?<year>\d++).*+
            # any other line
            |[^\n]*+
        )
        \n
    )*
    ''',
    VERBOSE | MULTILINE,
).fullmatch
Example #20
#! /usr/bin/python
# -*- coding: utf-8 -*-

"""Test urls_authors.BYLINE_PATTERN."""


from regex import compile as regex_compile, VERBOSE, IGNORECASE
from unittest import main, expectedFailure, TestCase

from lib.urls_authors import byline_to_names, BYLINE_PATTERN

BYLINE_PATTERN_REGEX = regex_compile(
    '^' + BYLINE_PATTERN + '$',
    IGNORECASE | VERBOSE
)


class RegexTest(TestCase):

    """BYLINE_PATTERN should pass the following tests."""

    def test_one_author(self):
        """http://www.defense.gov/News/NewsArticle.aspx?ID=18509"""
        text = 'By Jim Garamone'
        self.assertRegex(text, BYLINE_PATTERN_REGEX)

    def test_cap_names_joined_by_and(self):
        """Test two authors with and.

        Example:
        https://www.eff.org/deeplinks/2014/06/
Example #21
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Test urls_authors.BYLINE_PATTERN."""

from regex import compile as regex_compile, VERBOSE, IGNORECASE
import unittest

from src.urls_authors import byline_to_names, BYLINE_PATTERN

BYLINE_PATTERN_REGEX = regex_compile('^' + BYLINE_PATTERN + '$',
                                     IGNORECASE | VERBOSE)


class RegexTest(unittest.TestCase):
    """BYLINE_PATTERN should pass the following tests."""
    def test_one_author(self):
        """http://www.defense.gov/News/NewsArticle.aspx?ID=18509"""
        text = 'By Jim Garamone'
        self.assertRegex(text, BYLINE_PATTERN_REGEX)

    def test_cap_names_joined_by_and(self):
        """Test two authors with and.

        Example:
        https://www.eff.org/deeplinks/2014/06/
        sudan-tech-sanctions-harm-innovation-development-us-government-and-
        corporations-must-act

        Note the two consecutive spaces.

        """
Example #22
# -*- coding: utf-8 -*-

"""Codes specifically related to PubMed inputs."""

from collections import defaultdict
from config import NCBI_API_KEY, NCBI_EMAIL, NCBI_TOOL
from datetime import datetime
from logging import getLogger
from threading import Thread

from regex import compile as regex_compile

from lib.commons import dict_to_sfn_cit_ref, b_TO_NUM, request
from lib.doi import get_crossref_dict

NON_DIGITS_SUB = regex_compile(r'[^\d]').sub

NCBI_URL = (
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'
    'api_key=' + NCBI_API_KEY + '&retmode=json&tool=' + NCBI_TOOL + '&email='
    + NCBI_EMAIL)
PUBMED_URL = NCBI_URL + '&db=pubmed&id='
PMC_URL = NCBI_URL + '&db=pmc&id='
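
Because these are bound .sub/.search methods, the replacement string comes first; a small illustration with a made-up PMID value:

print(NON_DIGITS_SUB('', 'PMID: 29867041'))              # '29867041'
print(PUBMED_URL + NON_DIGITS_SUB('', 'PMID: 29867041'))  # '...&db=pubmed&id=29867041'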


class NCBIError(Exception):

    pass


def pmid_sfn_cit_ref(pmid: str, date_format='%Y-%m-%d') -> tuple:
Example #23
"""All things that are specifically related to adinebook website"""

from collections import defaultdict
from logging import getLogger
from typing import Optional

from langid import classify
from regex import compile as regex_compile
from requests import RequestException
from mechanicalsoup import StatefulBrowser

from lib.commons import first_last, dict_to_sfn_cit_ref, request, USER_AGENT,\
    LANG


ISBN_SEARCH = regex_compile(r'ISBN: </b> ([-\d]++)').search
DATE_SEARCH = regex_compile(
    r'تاریخ نشر:</b>(?<year>\d\d)/(?<month>\d\d)/(?<day>\d\d)').search
PUBLISHER_SEARCH = regex_compile(
    r'Publisher_ctl00_NameLabel" class="linkk">(.*?)</span>').search
VOLUME_SEARCH = regex_compile(r'\bجلد (\d+)').search
TITLE_SEARCH = regex_compile(r'BookTitle" class="h4">([^<]++)').search
AUTHORS_FINDALL = regex_compile(
    r'rptAuthor_ctl\d\d_NameLabel" class="linkk">([^>:]++):([^<]++)<').findall
LOCATION_SEARCH = regex_compile(r'محل نشر:</b>([^<]++)<').search


def ketabir_sfn_cit_ref(url: str, date_format='%Y-%m-%d') -> tuple:
    """Return the response namedtuple."""
    dictionary = url2dictionary(url)
    dictionary['date_format'] = date_format
Example #24
# Todo: can the tags method be implemented using a TAG_FINDITER? Will
# that be more performant?
# TAG_FINDITER should not find any tag containing other tags.
# TAG_CONTENTS = r'(?<contents>(?>(?!{TAG}).)*?)'.format(
#     TAG=TAG.format(**locals())
# )
# TAG_FINDITER = regex_compile(
#     TAG.format(**locals()), flags=DOTALL | VERBOSE
# ).finditer
# Note that the following regex won't check for nested tags
TAG_FULLMATCH = regex_compile(
    rb'''
    <(?<name>[A-Za-z0-9]++)''' + ATTRS_PATTERN + rb'''
    [''' + SPACE_CHARS + rb''']*+
    (?>
        (?<self_closing>/\s*>)
        |>(?<contents>.*)''' + END_TAG_PATTERN.replace(
        rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)') +  # noqa
    rb'''|>  # only start; no end tag
    )''',
    DOTALL | VERBOSE).fullmatch


class SubWikiTextWithAttrs(SubWikiText):
    """Define a class for SubWikiText objects that have attributes.

    Any class that is going to inherit from SubWikiTextWithAttrs should provide
    _attrs_match property. Note that matching should be done on shadow.
    It's usually a good idea to cache the _attrs_match property.
    """
Example #25
class NominatimMixin(object):
    NS = regex_compile(r'[\p{script=Han}\p{script=Tibetan}\p{script=Lao}'
                       r'\p{script=Thai}\p{script=Khmer}]', regex_U)
    NS = frozenset(NS.findall(u''.join(unichr(i) for i in xrange(maxunicode))))
    # String geocode and reverse geocode operations provided by Nominatim
    TYPICAL_GEOCODE_QUERY = "geocode"
    TYPICAL_GEOCODE_SCRIPT = "search.php?"
    ATTEMPT_GEOCODE_ADJUST = regex_compile(r'[^\p{L}\p{N}\p{M},]', regex_U)
    CONSIDERATION_PRIORITY = (
        'street', 'postalcode', 'county', 'city', 'state', 'country')
    CONSIDERATION_ATTEMPTS = tuple(
        map(frozenset, (
            ('country', 'state', 'city', 'street'),
            ('country', 'state', 'city', 'county'),
            ('country', 'postalcode'),
            ('country', 'state', 'city',),
            ('state', 'city'),
            ('country', 'state'),
            ('country', 'city'),
            ('state', 'county'),
            ('country', 'street'),
            ('state', 'street'),
            ('city', 'street'),
            ('county', 'street'),
            ('postalcode', 'street'),
            ('country',),
            ('state',),
            ('postalcode',),
            ('city',),
            ('street',),
        )))
    BLACKLIST_PHRASES = frozenset([
        "other", "n/a", "none", "unknown", "nowhere", "null",
        u'\u6d77\u5916', u'\u5176\u4ed6', u'\u5176\u5b83'])
    __slots__ = ()  # This class simply stores methods, no __dict__ needed.

    @staticmethod
    def __urlencode_query(params):
        return urlencode(sorted([(k.encode('utf-8'), v.encode('utf-8'))
                                 for k, v in params.items()]))

    def run_geocode(self, query, errors):
        body = self.fxn(self._ns + query)
        try:
            return loads(body.strip())
        except:
            if "DB Error" in body:
                self.calls -= 1
                wait = self._sleep.value * 10
                if self._debug:
                    stdout.write(
                        "\nDetected PostgreSQL database "
                        "error. Sleeping for %.2f seconds."
                        "\nBad Request: %s\n" % (wait, query))
                if "DB Error" in errors:
                    self.calls += 1
                    errors.remove("DB Error")
                else:
                    errors.add("DB Error")
                sleep(wait)
            elif "Internal Server Error" in body:
                if self._debug:
                    stdout.write(
                        "\nDetected Nominatim internal error."
                        "\nBad Request: %s\n" % query)
                sleep(self._sleep.value)
            else:
                if self._debug:
                    stdout.write(
                        "\nEncountered unknown error.\n%s\n%s"
                        "\nBad Request: %s\n"
                        % (body, format_exc(), query))
                sleep(self._sleep.value)
            raise NominatimResponseError(
                query, body, "Response was not legal JSON.")

    @classmethod
    def get_geocode(cls, query, attempt=0, juggle=False):
        for value in query.itervalues():
            substrings = list(filter(None, value.lower().strip().split()))
            if len(substrings) > 2:
                continue
            for substring in substrings:
                for ignore in cls.BLACKLIST_PHRASES:
                    if substring.startswith(ignore):
                        return None

        params = {field: query[field] for field in cls.CONSIDERATION_PRIORITY
                  if field in query and query[field]}
        if params:
            for keyset in cls.CONSIDERATION_ATTEMPTS:
                if keyset.difference(params):
                    continue
                if attempt > 0:
                    attempt -= 1
                else:
                    return {field: params[field] for field in keyset}

        if any(cls.NS.intersection(value) for value in query.itervalues()):
            left2right = 0
        else:
            left2right = 1

        if attempt > 0:
            if juggle:
                attempt += 1
                if attempt % 2 == left2right:
                    cut = slice(int(attempt // 2), None, None)
                else:
                    cut = slice(None, -int(attempt // 2), None)
            else:
                if left2right:
                    cut = slice(int(attempt), None, None)
                else:
                    cut = slice(None, -int(attempt), None)
        else:
            cut = slice(None, None, None)

        if 'q' not in query or not query['q'].strip():
            query['q'] = u', '.join(
                query[field].strip() for field in cls.CONSIDERATION_PRIORITY
                if field in query and query[field].strip())
        query_input = query['q']

        if u',' in query_input:
            split_input = u','.join(query_input.split(u',')[cut]).split()
        else:
            split_input = cls.ATTEMPT_GEOCODE_ADJUST.split(query_input)[cut]

        if split_input:
            return {"q": u' '.join(split_input).rstrip(u',')}
        else:
            return None

    def res_geocode(self, query, errors):
        query.update(self.arguments)
        return self.run_geocode(self.TYPICAL_GEOCODE_SCRIPT +
                                self.__urlencode_query(query), errors)

    REVERSE_GEOCODE_QUERY = "reverse"
    REVERSE_GEOCODE_SCRIPT = "reverse.php?"

    @classmethod
    def get_reverse(cls, query, attempt=0):
        # For an explanation of recorded accuracy see:
        # https://en.wikipedia.org/wiki/Decimal_degrees#Precision
        if attempt == 0:
            return {"lat": query["lat"], "lon": query["lon"]}
        else:
            return None

    def res_reverse(self, query, errors):
        query.update(self.arguments)
        return self.run_geocode(self.REVERSE_GEOCODE_SCRIPT +
                                self.__urlencode_query(query), errors)
Example #26
PM_PF_TL_FINDITER = regex_compile(
    rb'\{\{'
    rb'(?>'
    # param
    rb'\{(?>[^{}]*+|}(?!})|{(?!{))*+\}\}\}()'
    rb'|'
    # parser function
    rb'\s*+'
    # generated pattern: _config.regex_pattern(_config._parser_functions)
    # with \#[^{}\s:]++ added manually.
    rb'(?>\#[^{}\s:]++|u(?>rlencode|c(?:first)?+)|s(?>ubst|afesubst)|raw|p(?>l'
    rb'ural|ad(?>right|left))|nse?+|msg(?:nw)?+|l(?>ocalurl|c(?:first)?+)|int|'
    rb'g(?>rammar|ender)|f(?>ullurl|ormatnum|ilepath)|canonicalurl|anchorencod'
    rb'e|TALK(?>SPACEE?+|PAGENAMEE?+)|SUB(?>PAGENAMEE?+|JECT(?>SPACEE?+|PAGENA'
    rb'MEE?+))|R(?>OOTPAGENAMEE?+|EVISION(?>YEAR|USER|TIMESTAMP|MONTH1?+|ID|DA'
    rb'Y2?+))|P(?>ROTECTION(?>LEVEL|EXPIRY)|AGE(?>SI(?>ZE|N(?>N(?>S|AMESPACE)|'
    rb'CAT(?:EGORY)?+))|NAMEE?+|ID))|N(?>UM(?>INGROUP|BER(?>OF(?>VIEWS|USERS|P'
    rb'AGES|FILES|EDITS|A(?>RTICLES|DMINS|CTIVEUSERS))|INGROUP))|AMESPACE(?>NU'
    rb'MBER|E)?+)|FULLPAGENAMEE?+|D(?>ISPLAYTITLE|EFAULT(?>SORT(?:KEY)?+|CATEG'
    rb'ORYSORT))|CASCADINGSOURCES|BASEPAGENAMEE?+|ARTICLE(?>SPACEE?+|PAGENAMEE'
    rb'?+))'
    # end of generated part
    rb':(?>[^{}]*+|}(?!})|{(?!{))*+\}\}()'
    rb'|'
    # invalid template name
    rb'[\s_]*+'  # invalid name
    rb'(?:\|(?>[^{}]++|{(?!{)|}(?!}))*+)?+'  # args
    rb'\}\}()'
    rb'|'
    # template
    rb'\s*+' + VALID_TITLE_CHARS_PATTERN +  # template name
    rb'\s*+'
    rb'(?:\|(?>[^{}]++|{(?!{)|}(?!}))*+)?+'  # args
    rb'\}\}'
    rb')').finditer
Example #27
        ,?\ ++and\ {NAME_PATTERN}(
            ,\ {NAME_PATTERN}(
                ,\ {NAME_PATTERN}
                |
                ,?\ ++and\ {NAME_PATTERN}
            )?
            |
            ,?\ ++and\ {NAME_PATTERN}(
                ,\ {NAME_PATTERN}
                |
                ,?\ ++and\ {NAME_PATTERN}
            )?
        )?
    )?\s*
'''.format_map(locals())
BYLINE_PATTERN_SEARCH = regex_compile(BYLINE_PATTERN, VERBOSE | IGNORECASE)

NORMALIZE_ANDS = regex_compile(r'\s++and\s++', IGNORECASE).sub
NORMALIZE_COMMA_SPACES = regex_compile(r'\s*+,\s++', IGNORECASE).sub
BY_PREFIX = regex_compile(
    r'''
    ^(?:
        (?>
            [^b]++
            |
            (?<!\b)b
            |b(?!y)
        )*+
        \bby\s++
    )?
    ([^\r\n]++)
"""Define the ExternalLink class."""

from typing import Optional, List

from regex import compile as regex_compile

from ._wikitext import SubWikiText, BRACKET_EXTERNAL_LINK_URL

URL_MATCH = regex_compile(BRACKET_EXTERNAL_LINK_URL).match


class ExternalLink(SubWikiText):

    __slots__ = ()

    @property
    def url(self) -> str:
        """URL of the current ExternalLink object.

        getter: Return the URL.
        setter: Set a new value for the URL. Brackets are added for bare
            external links.
        """
        if self(0) == '[':
            return self(1, URL_MATCH(self._ext_link_shadow, 1).end())
        return self.string

    @url.setter
    def url(self, newurl: str) -> None:
        if self(0) == '[':
            self[1:len('[' + self.url)] = newurl
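
# A hedged usage sketch (assumption: this class is consumed through the
# wikitextparser package, whose parsed pages expose `external_links`):
#
#     import wikitextparser as wtp
#     link = wtp.parse('[https://example.org Example]').external_links[0]
#     link.url                   # 'https://example.org'
#     link.url = 'https://example.com'
#     link.string                # '[https://example.com Example]'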
Exemple #29
import logging
from threading import Thread
from datetime import date
from urllib.parse import urlparse

from regex import compile as regex_compile
from requests import ConnectionError as RequestsConnectionError

from lib.commons import dict_to_sfn_cit_ref
from lib.urls import (urls_scr, url2dict, get_home_title, get_html,
                      find_authors, find_journal, find_site_name, find_title,
                      ContentTypeError, ContentLengthError, StatusCodeError,
                      TITLE_TAG)

URL_FULLMATCH = regex_compile(
    r'https?+://web(?:-beta)?+\.archive\.org/(?:web/)?+'
    r'(\d{4})(\d{2})(\d{2})\d{6}(?>cs_|i(?>d_|m_)|js_)?+/(http.*)').fullmatch
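
# Illustrative only: for a typical snapshot URL the fullmatch captures the
# snapshot year, month, day, and the original URL, e.g.
#     URL_FULLMATCH(
#         'https://web.archive.org/web/20180101000000/http://example.org'
#     ).groups()  ->  ('2018', '01', '01', 'http://example.org')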


def waybackmachine_scr(archive_url: str,
                       date_format: str = '%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    m = URL_FULLMATCH(archive_url)
    if not m:
        # Could not parse the archive_url. Treat as an ordinary URL.
        return urls_scr(archive_url, date_format)
    archive_year, archive_month, archive_day, original_url = \
        m.groups()
    original_dict = {}
    thread = Thread(target=original_url2dict,
                    args=(original_url, original_dict))
Exemple #30
jB_TO_NUM = {
    'فروردین': 1,
    'اردیبهشت': 2,
    'خرداد': 3,
    'تیر': 4,
    'مرداد': 5,
    'شهریور': 6,
    'مهر': 7,
    'آبان': 8,
    'آذر': 9,
    'دی': 10,
    'بهمن': 11,
    'اسفند': 12
}

DOUBLE_DIGIT_SEARCH = regex_compile(r'\d\d').search

# Date patterns:

# January|February...
B = (r'''
    (?<B>(?:J(?:anuary|u(?:ne|ly))
    |
    February
    |
    Ma(?:rch|y)
    |
    A(?:pril|ugust)
    |
    (?:(?:(?:Sept|Nov|Dec)em)|Octo)ber))
    ''')
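
# A small sketch (assumptions: this pattern is meant to be compiled with the
# VERBOSE flag, as its layout suggests, and regex_compile/VERBOSE are
# available in this module):
#     regex_compile(B, VERBOSE).search('Retrieved 5 March 2014')['B']
#         ->  'March'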
Exemple #31
#! /usr/bin/python
# -*- coding: utf-8 -*-

"""Codes specifically related to Noormags website."""

from threading import Thread

from regex import compile as regex_compile

from lib.commons import dict_to_sfn_cit_ref, request
from lib.bibtex import parse as bibtex_parse
from lib.ris import parse as ris_parse


BIBTEX_ARTICLE_ID_SEARCH = regex_compile(r'(?<=/citation/bibtex/)\d+').search
RIS_ARTICLE_ID_SEARCH = regex_compile(r'(?<=/citation/ris/)\d+').search


def noormags_sfn_cit_ref(url: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    ris_collection = {}
    ris_thread = Thread(target=ris_fetcher_thread, args=(url, ris_collection))
    ris_thread.start()
    dictionary = bibtex_parse(get_bibtex(url))
    dictionary['date_format'] = date_format
    # The language parameter needs to be taken from RIS;
    # other information is more accurate in bibtex,
    # for example: http://www.noormags.ir/view/fa/articlepage/104040
    # "IS  - 1" is wrong in RIS but "number = { 45 }," is correct in bibtex
    ris_thread.join()
    dictionary.update(ris_collection)
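
# A plausible sketch of the RIS helper referenced above (the real
# implementation lives elsewhere in this module). It assumes a `get_ris`
# helper analogous to `get_bibtex` (hypothetical here) and copies only the
# language field, per the comment about RIS vs bibtex accuracy.
def ris_fetcher_thread(url, ris_collection):
    """Fill ris_collection with the fields taken from the RIS record."""
    ris_dict = ris_parse(get_ris(url))  # get_ris: assumed helper, not shown
    language = ris_dict.get('language')
    if language is not None:
        ris_collection['language'] = language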
Exemple #32
"""Define the functions required for parsing wikitext into spans."""

from typing import Dict, List, Callable, Any, Optional

from regex import VERBOSE, IGNORECASE
from regex import compile as regex_compile

# According to https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars
# illegal title characters are: r'[]{}|#<>[\u0000-\u0020]'
VALID_TITLE_CHARS_PATTERN = rb'[^\x00-\x1f\|\{\}\[\]<>\n]++'
# Templates
TEMPLATE_FINDITER = regex_compile(
    rb'\{\{\s*+'
    # name
    + VALID_TITLE_CHARS_PATTERN + rb'''
    \s*+
    (?>\|[^{}]*+)?+  # optional args
    \}\}''',
    VERBOSE,
).finditer
INVALID_TL_NAME_FINDITER = regex_compile(
    rb'''
    \{\{
    [\s_]*+ # invalid name
    (?>\|[^{}]*)?+  # optional args
    \}\}
    ''',
    VERBOSE,
).finditer
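
# Quick illustration (bytes in, byte-string matches out): a normally named
# template is found by TEMPLATE_FINDITER, while one whose name is empty or
# only whitespace is caught by INVALID_TL_NAME_FINDITER instead.
#     [m[0] for m in TEMPLATE_FINDITER(b'{{cite|x}} {{ |y}}')]
#         ->  [b'{{cite|x}}']
#     [m[0] for m in INVALID_TL_NAME_FINDITER(b'{{cite|x}} {{ |y}}')]
#         ->  [b'{{ |y}}']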
# Parameters
PARAMETER_FINDITER = regex_compile(
Exemple #33
from ._spans import ATTRS_MATCH
from ._tag import SubWikiTextWithAttrs
from ._wikitext import WS

CAPTION_MATCH = regex_compile(
    rb"""
    # Everything until the caption line
    (?P<preattrs>
        # Start of table
        {\|
        (?:
            (?:
                (?!\n\s*+\|)
                [\s\S]
            )*?
        )
        # Start of caption line
        \n\s*+\|\+
    )
    # Optional caption attrs
    (?:
        (?P<attrs>[^\n|]*+)
        \|(?!\|)
    )?
    (?P<caption>.*?)
    (?:\n[\|\!]|\|\|)
    """, DOTALL | VERBOSE).match
T = TypeVar('T')

HEAD_DIGITS = regex_compile(rb'\s*+\d+').match
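
# Worked example (illustrative): CAPTION_MATCH above pulls the caption text
# out of table markup.
#     CAPTION_MATCH(b'{| class="wikitable"\n|+ Caption\n|-\n| cell\n|}')['caption']
#         ->  b' Caption'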
Exemple #34
from regex import VERBOSE, IGNORECASE
from regex import compile as regex_compile

from ._config import (_parsable_tag_extensions, regex_pattern,
                      _unparsable_tag_extensions, _bare_external_link_schemes,
                      _parser_functions, _HTML_TAG_NAME)

# According to https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars
# illegal title characters are: r'[]{}|#<>[\u0000-\u0020]'
VALID_TITLE_CHARS_PATTERN = rb'[^\x00-\x1f\|\{\}\[\]<>\n]++'
# Parameters
# Parser functions
# According to https://www.mediawiki.org/wiki/Help:Magic_words
# See also:
# https://translatewiki.net/wiki/MediaWiki:Sp-translate-data-MagicWords/fa
PARAMS_FINDITER = regex_compile(
    rb'\{\{\{(?>[^{}]*+|}(?!})|{(?!{))*+\}\}\}').finditer
PF_TL_FINDITER = regex_compile(rb'\{\{'
                               rb'(?>'
                               # parser function
                               rb'\s*+'
                               rb'(?>\#[^{}\s:]++|' +
                               regex_pattern(_parser_functions).encode()[3:] +
                               # end of generated part
                               rb':(?>[^{}]*+|}(?!})|{(?!{))*+\}\}()'
                               rb'|'
                               # invalid template name
                               rb'[\s_]*+'  # invalid name
                               rb'(?:\|(?>[^{}]++|{(?!{)|}(?!}))*+)?+'  # args
                               rb'\}\}()'
                               rb'|'
                               # template
Exemple #35
from langid import classify
from regex import compile as regex_compile, DOTALL

from config import LANG
from lib.ketabir import url2dictionary as ketabir_url2dictionary
from lib.ketabir import isbn2url as ketabir_isbn2url
from lib.bibtex import parse as bibtex_parse
from lib.commons import dict_to_sfn_cit_ref, request  # , Name
from lib.ris import parse as ris_parse

# original regex from:
# https://www.debuggex.com/r/0Npla56ipD5aeTr9
# https://www.debuggex.com/r/2s3Wld3CVCR1wKoZ
ISBN_10OR13_SEARCH = regex_compile(
    r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*){9}\d'
    r'|(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]').search

ISBN10_SEARCH = regex_compile(
    r'(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]').search

ISBN13_SEARCH = regex_compile(
    r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}\d'
).search

# original regex from: http://stackoverflow.com/a/14260708/2705757
# ISBN_REGEX = regex_compile(
#     r'(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)'
#     r'?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]'
# )
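
# Illustrative check: the combined search finds hyphen- or space-separated
# ISBNs in free text, e.g.
#     ISBN_10OR13_SEARCH('ISBN 978-1-4028-9462-6')[0]
#         ->  '978-1-4028-9462-6'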
Exemple #36
from urllib.parse import unquote
from html import unescape

from langid import classify
from regex import compile as regex_compile, VERBOSE

from lib.commons import dict_to_sfn_cit_ref, request
from config import LANG

# The regex is from:
# http://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page
DOI_SEARCH = regex_compile(
    r'''
    \b(
        10\.[0-9]{4,}+
        (?:\.[0-9]++)*+
        /[^"&\'\s]++
    )\b
    ''',
    VERBOSE,
).search
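
# Illustrative check: the search extracts a DOI embedded in surrounding text.
#     DOI_SEARCH('see https://doi.org/10.1000/182 for details')[1]
#         ->  '10.1000/182'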


def doi_scr(doi_or_url, pure=False, date_format='%Y-%m-%d') -> tuple:
    """Return the response namedtuple."""
    if pure:
        doi = doi_or_url
    else:
        # unescape '&amp;', '&lt;', and '&gt;' in doi_or_url
        # decode percent encodings
        decoded_url = unquote(unescape(doi_or_url))
        doi = DOI_SEARCH(decoded_url)[1]
Exemple #37
    VALID_EXTLINK_CHARS,
    BARE_EXTLINK_SCHEMES_PATTERN,
)


# External links (comment inclusive)
BRACKET_EXTERNALLINK_PATTERN = (
    rb'\[(?>//|' + BARE_EXTLINK_SCHEMES_PATTERN + rb')'
    + VALID_EXTLINK_CHARS + rb'\ *+[^\]\n]*+\]'
)
BARE_EXTERNALLINK_PATTERN = (
    rb'(?>' + BARE_EXTLINK_SCHEMES_PATTERN + rb')' + VALID_EXTLINK_CHARS
)
EXTERNALLINK_FINDITER = regex_compile(
    rb'(?:' + BARE_EXTERNALLINK_PATTERN
    + rb'|' + BRACKET_EXTERNALLINK_PATTERN + rb')',
    IGNORECASE,
).finditer

# Sections
SECTIONS_FULLMATCH = regex_compile(
    rb'''
    (?<section>.*?)
    (?<section>
        ^(?<eq>={1,6})[^\n]+?(?P=eq)[ \t]*+$  # header
        .*?
    )*  # todo: why can't this be made possessive?
    ''',
    DOTALL | MULTILINE | VERBOSE,
).fullmatch
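
# Illustrative check (duplicate group names are allowed by the third-party
# `regex` package; captures() returns every capture of the `section` group):
#     SECTIONS_FULLMATCH(b'lead\n== A ==\nbody\n').captures('section')
#         ->  [b'lead\n', b'== A ==\nbody\n']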
Exemple #38
"""Define the Comment class."""
from typing import Dict, List, MutableSequence, Optional, Union

from regex import compile as regex_compile

from ._wikitext import SubWikiText
from ._spans import COMMENT_PATTERN


COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ")*+"
COMMENT_COMMA = "(?>" + COMMENT_PATTERN + ")*+'"
BOLD_FULLMATCH = regex_compile(
    COMMA_COMMENT * 2 + "'(.*)'" + COMMENT_COMMA * 2).fullmatch
ITALIC_FULLMATCH = regex_compile(
    COMMA_COMMENT + "'(.*)'" + COMMENT_COMMA).fullmatch
ITALIC_NOEND_FULLMATCH = regex_compile(
    COMMA_COMMENT + "'(.*)").fullmatch
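
# Illustrative checks: the optional comment pattern lets bold/italic markup
# interrupted by HTML comments still match, and the capture group holds the
# inner text.
#     BOLD_FULLMATCH("'''bold text'''")[1]    ->  'bold text'
#     ITALIC_FULLMATCH("''italic''")[1]       ->  'italic'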


class Comment(SubWikiText):
    __slots__ = ()

    @property
    def contents(self) -> str:
        """Return contents of this comment."""
        return self(4, -3)

    @property
    def comments(self) -> List['Comment']:
        return []
Exemple #39
 def tags(self, name=None) -> List['Tag']:
     """Return all tags with the given name (or all tags if name is None)."""
     lststr = self._lststr
     type_to_spans = self._type_to_spans
     if name:
         if name in _tag_extensions:
             string = lststr[0]
             return [
                 Tag(lststr, type_to_spans, span, 'ExtensionTag')
                 for span in type_to_spans['ExtensionTag']
                 if string.startswith('<' + name, span[0])
             ]
         tags = []  # type: List['Tag']
         tags_append = tags.append
     else:
         # There is no name; add all extension tags before using the shadow.
         tags = [
             Tag(lststr, type_to_spans, span, 'ExtensionTag')
             for span in type_to_spans['ExtensionTag']
         ]
         tags_append = tags.append
     # Get the left-most start tag, match it to right-most end tag
     # and so on.
     ss = self._span[0]
     shadow = self._shadow
     if name:
         # There is a name but it is not in _tag_extensions.
         reversed_start_matches = reversed([m for m in regex_compile(
             START_TAG_PATTERN.replace(
                 rb'{name}', rb'(?P<name>' + name.encode() + rb')'
             )
         ).finditer(shadow)])
         end_search = regex_compile(END_TAG_PATTERN.replace(
             b'{name}', name.encode()
         )).search
     else:
         reversed_start_matches = reversed(
             [m for m in START_TAG_FINDITER(shadow)]
         )
     shadow_copy = shadow[:]
     spans = type_to_spans.setdefault('Tag', [])
     span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get
     spans_append = spans.append
     for start_match in reversed_start_matches:
         if start_match['self_closing']:
             # Don't look for the end tag
             s, e = start_match.span()
             span = [ss + s, ss + e]
         else:
             # look for the end-tag
             if name:
                 # the end_search is already available
                 # noinspection PyUnboundLocalVariable
                 end_match = end_search(shadow_copy, start_match.end())
             else:
                 # build end_search according to start tag name
                 end_match = search(
                     END_TAG_PATTERN.replace(
                         b'{name}', start_match['name']
                     ),
                     shadow_copy,
                 )
             if end_match:
                 s, e = end_match.span()
                 shadow_copy[s:e] = b'_' * (e - s)
                 span = [ss + start_match.start(), ss + e]
             else:
                 # Assume start-only tag.
                 s, e = start_match.span()
                 span = [ss + s, ss + e]
         old_span = span_tuple_to_span_get((span[0], span[1]))
         if old_span is None:
             spans_append(span)
         else:
             span = old_span
         tags_append(Tag(lststr, type_to_spans, span, 'Tag'))
     return tags
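Exemple #40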
from pytest import mark
from regex import compile as regex_compile, VERBOSE, IGNORECASE

from lib.urls_authors import byline_to_names, BYLINE_PATTERN, \
    BYLINE_TAG_FINDITER

BYLINE_PATTERN_REGEX = regex_compile(
    fr'^{BYLINE_PATTERN}$',
    IGNORECASE | VERBOSE)


def test_byline_pattern_one_author():
    """http://www.defense.gov/News/NewsArticle.aspx?ID=18509"""
    assert BYLINE_PATTERN_REGEX.search('By Jim Garamone')


def test_byline_pattern_cap_names_joined_by_and():
    """Test two authors joined by 'and'.

    Example:
    https://www.eff.org/deeplinks/2014/06/
    sudan-tech-sanctions-harm-innovation-development-us-government-and-
    corporations-must-act

    Note the two consecutive spaces.

    """
    assert BYLINE_PATTERN_REGEX.search('By Kimberly Carlson  and Jillian York')


def test_byline_pattern_four_authors():
Exemple #41
#! /usr/bin/python
# -*- coding: utf-8 -*-

"""Codes specifically related to Noorlib website."""

from regex import compile as regex_compile

from lib.commons import dict_to_sfn_cit_ref, request
from lib.bibtex import parse as bibtex_parse


BIBTEX_ARTICLE_ID_SEARCH = regex_compile(
    r'(?<=CitationHandler\.ashx\?id=)\d+').search
RIS_ARTICLE_ID_SEARCH = regex_compile(r'(?<=RIS&id=)\d+').search


def noorlib_sfn_cit_ref(url: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    dictionary = bibtex_parse(get_bibtex(url))
    dictionary['date_format'] = date_format
    # risr = get_ris(url)[1]
    # dictionary = risr.parse(ris)[1]
    return dict_to_sfn_cit_ref(dictionary)


def get_bibtex(noorlib_url):
    """Get bibtex file content from a noorlib url. Return as string."""
    pagetext = request(noorlib_url).text
    article_id = BIBTEX_ARTICLE_ID_SEARCH(pagetext)[0]
    url = 'http://www.noorlib.ir/View/HttpHandler/CitationHandler.ashx?id=' +\
          article_id + '&format=BibTex'