예제 #1
0
def make_hash(body):
    """Hash the lowercase ASCII letters of ``body`` with CityHash64.

    The text is lowercased and every character outside ``a``-``z`` is
    removed before hashing.

    :param body: text to fingerprint
    :return: the CityHash64 value of the remaining letters, or ``None``
        when no letters remain
    """
    letters_only = re.sub('[^a-z]', '', body.lower())
    if not letters_only:
        # Nothing hashable is left once non-letters are stripped.
        return None
    return CityHash64(letters_only)
예제 #2
0
def city_hash_uint64(string):
    """Return the low 32 bits of CityHash64, reinterpreted as a signed int.

    Anything that is not exactly a ``str`` is converted with ``str()``
    before hashing. Despite the function name, the returned value lies in
    the signed 32-bit range.
    """
    text = string if type(string) is str else str(string)
    low_bits = CityHash64(text) & 0xffffffff
    # Values above the signed 32-bit maximum wrap around to negatives.
    return low_bits - 0x100000000 if low_bits > 0x7fFFffFF else low_bits
예제 #3
0
    def shingle(self, words):
        """Return the set of hashed shingles found in ``words``.

        Every run of ``self.shingle_size`` consecutive words is joined with
        single spaces, hashed with CityHash64 and stored as the string form
        of that hash. Inputs shorter than the shingle size give an empty set.
        """
        width = self.shingle_size
        windows = (words[start:start + width]
                   for start in range(len(words) - (width - 1)))
        return {str(CityHash64(" ".join(window))) for window in windows}
예제 #4
0
 def set_motifs(self, motifs):
     """Register the motif file and, when caching is enabled, its checksum.

     Stores ``motifs`` on the instance, reads the ids of all motifs in that
     file into ``self.motif_ids`` and resets ``self.checksum``. When
     ``self.use_cache`` is set, the CityHash64 of the sorted, newline-joined
     motif ids is recorded under the motif file name.

     :param motifs: path of the motif file to load
     """
     self.motifs = motifs
     # Close the motif file deterministically instead of leaking the handle.
     with open(motifs) as handle:
         self.motif_ids = [m.id for m in read_motifs(handle)]
     self.checksum = {}
     if self.use_cache:
         chksum = CityHash64("\n".join(sorted(self.motif_ids)))
         self.checksum[self.motifs] = chksum
예제 #5
0
def inspect_packet(p):
    """Attach packet ``p`` to its flow, creating the flow when it is new.

    A connection id is the CityHash64 of the concatenated source/destination
    addresses and ports. The id of the reversed ordering is computed as well,
    so a packet travelling in the opposite direction is matched to the flow
    that was registered under the forward id.

    For a known flow the packet is added and the flow is handed to the
    learning or detection routine, depending on the module-level switches.

    Returns True when a new flow was created, False when the packet was
    added to an existing flow.
    """
    # Module-level state used by this function.
    global flows
    global connNum
    global learn_protocol_on
    global detect_protocol_on

    forward = str(p.src_ip) + str(p.dst_ip) + str(p.src_port) + str(p.dst_port)
    backward = str(p.dst_ip) + str(p.src_ip) + str(p.dst_port) + str(
        p.src_port)
    connId = CityHash64(forward)
    connId_revr = CityHash64(backward)

    if debug_showdata == True:
        print('ConnId:  ' + str(connId))
        print('ConnId_r:  ' + str(connId_revr))
        if p.direction == const.INBOUND:
            print("packet inbound")
        else:
            print("packet outbound")

    # Prefer the forward id; fall back to the reversed one.
    if connId in flows:
        known_id = connId
    elif connId_revr in flows:
        known_id = connId_revr
    else:
        known_id = None

    if known_id is None:
        # New connection - the first packet (p) seeds a new flow.
        flow = types.Flow()
        update_flow(flow, p)
        flows.update({connId: flow})
        # Adding 5tuple info to the flow
        add_5tuple_info(flows[connId], p)
        print("New connection established. ConnId: %i" % (connId))
        return True

    update_flow(flows[known_id], p)
    if learn_protocol_on:
        learning(flows[known_id])
    elif detect_protocol_on:
        inspect_flow(flows[known_id])
    return False
    """
예제 #6
0
    def _scan_sequences(self, seqs, nreport, scan_rc, cutoff=0.95):
        """Yield one scan result per sequence, using the cache when enabled.

        Without the cache, every sequence in ``seqs`` is scanned and the
        results are yielded as the jobs complete. With the cache, only the
        upper-cased sequences that have no cached result are scanned; their
        results are stored, and then one cached result is yielded for every
        entry of ``seqs`` in input order.

        :param seqs: sequences to scan
        :param nreport: forwarded to ``scan_seq_mult`` and part of the cache key
        :param scan_rc: forwarded to ``scan_seq_mult`` and part of the cache key
        :param cutoff: forwarded to ``load_motifs`` and part of the cache key
        :raises Exception: when a result is missing from the cache right
            after scanning (the cache could not hold all results)
        """
        motif_file = self.motifs
        motif_digest = self.checksum.get(motif_file, None)

        def cache_key(seq_hash):
            # One place that defines how a cached result is addressed.
            return str((seq_hash, motif_digest, nreport, scan_rc, cutoff))

        scan_seqs = seqs
        if self.use_cache:
            # determine which sequences are not in the cache
            hashes = {s.upper(): CityHash64(s.upper()) for s in seqs}
            scan_seqs = []

            for seq, seq_hash in hashes.items():
                ret = self.cache.get(cache_key(seq_hash))
                if ret == NO_VALUE or ret is None:
                    # Keys of ``hashes`` are already upper-cased.
                    scan_seqs.append(seq)

        # scan the sequences that are not in the cache
        if len(scan_seqs) > 0:
            n = 12
            motifs = load_motifs(motif_file, cutoff)
            scan_func = partial(scan_seq_mult,
                                motifs=motifs,
                                nreport=nreport,
                                scan_rc=scan_rc)

            # Floor division keeps the chunk size an int; true division
            # yields a float on Python 3, which cannot be used for slicing.
            chunksize = len(scan_seqs) // n + 1
            jobs = []
            for i in range(n):
                job = pool.apply_async(
                    scan_func,
                    (scan_seqs[i * chunksize:(i + 1) * chunksize], ))
                jobs.append(job)

            # store values in cache
            i = 0
            for job in jobs:
                for ret in job.get():
                    if self.use_cache:
                        self.cache.set(cache_key(hashes[scan_seqs[i]]), ret)
                    else:
                        yield ret
                    i += 1

        if self.use_cache:
            # return results from cache
            for seq in seqs:
                ret = self.cache.get(cache_key(hashes[seq.upper()]))
                if ret == NO_VALUE or ret is None:
                    raise Exception("cache is not big enough to hold all "
                                    "results, try increasing the cache size "
                                    "or disable cache")

                yield ret
예제 #7
0
def chash(obj):
    """Hash ``obj`` with CityHash64 and pass the result through ``long2int``.

    :param obj: input string/hashable object
    :type obj: object
    :return: integer
    :rtype: int
    """
    digest = CityHash64(obj)
    return long2int(digest)
예제 #8
0
def _create_hash(string: str) -> str:
    """Return the CityHash64 of ``string`` as text.

    Args:
        string (str): text to hash

    Returns:
        str: ``str()`` of the CityHash64 value
    """
    digest = CityHash64(string)
    return str(digest)
예제 #9
0
def generate_strings() -> Iterable[Tuple[str, int]]:
    """Yield an endless stream of ``(identifier, CityHash64(identifier))``.

    A running position is advanced by a randomised step on every iteration
    and written in base ``END - START`` using ``ALPHABET`` as the digit set,
    most significant digit first. Because the position only grows, every
    identifier comes from a different number.
    """
    base = END - START
    pos = 0
    while True:
        pos += (500000000 + (10000000 - random.randint(1, 20000000)))
        shift = pos
        digits = []
        while shift > 0:
            # divmod stays in exact integer arithmetic; floor(shift / base)
            # goes through a float and loses precision beyond 2**53.
            shift, remainder = divmod(shift, base)
            digits.append(ALPHABET[remainder])
        # Digits were collected least significant first.
        digits.reverse()
        identifier = "".join(digits)
        yield (identifier, CityHash64(identifier))
예제 #10
0
    def add(self, bot, update, args):
        """
        Adds a rss subscription to user

        Expects exactly two arguments, ``<url> <entryname>``. The request is
        rejected with a reply message when the argument count is wrong, the
        URL cannot be parsed as a feed, or the URL or entry name is already
        among the user's subscriptions. If the URL is not known to the
        database yet, its current feed items are stored (keyed by the
        CityHash64 of summary + title + link) before the bookmark is added.
        """

        telegram_user = update.message.from_user

        if len(args) != 2:
            message = "Sorry! I could not add the entry! Please use the the command passing the following arguments:\n\n /add <url> <entryname> \n\n Here is a short example: \n\n /add http://www.feedforall.com/sample.xml ExampleEntry"
            update.message.reply_text(message)
            return

        arg_url, arg_entry = args

        # Check if argument matches url format
        feed = FeedHandler.is_parsable(url=arg_url)
        if not feed:
            # The URL is now quoted on both sides (the opening quote was missing).
            message = f"Sorry! It seems like '{arg_url}'" + \
                " doesn't provide an RSS news feed... Have you tried another URL from that provider?"
            update.message.reply_text(message)
            return

        # Check if entry does not exists
        entries = self.db.get_urls_for_user(telegram_id=telegram_user.id)
        if any(arg_url.lower() in entry.lower() for entry, _ in entries):
            message = f"Sorry, {telegram_user.first_name}" + \
                "! I already have that url stored in your subscriptions."
            update.message.reply_text(message)
            return

        if any(arg_entry in entry for entry in entries):
            # Both halves must be f-strings, otherwise the hint shows the
            # literal text "{arg_entry}" instead of the entry name.
            message = f"Sorry! I already have an entry with name {arg_entry}" + \
                f" stored in your subscriptions.. Please choose another entry name or delete the entry using '/remove {arg_entry}'"
            update.message.reply_text(message)
            return

        urls = self.db.get_all_urls()
        if arg_url not in urls:
            items = {}
            for item in feed:
                # Make sure every field used for the fingerprint exists.
                for key in ['summary', 'title', 'link']:
                    if not (key in item.keys()):
                        item[key] = ''
                # NOTE(review): keys are raw ints here; confirm the storage
                # layer and its readers agree on int vs str keys.
                item_hash = CityHash64(item['summary'] + item['title'] + item['link'])
                items[item_hash] = {'active': True, 'last_date': DateHandler.get_datetime_now()}
            self.db.add_url(url=arg_url, items=items)

        self.db.add_user_bookmark(telegram_id=telegram_user.id, url=arg_url, alias=arg_entry)
        message = f"I successfully added {arg_entry} to your subscriptions!"
        update.message.reply_text(message)
예제 #11
0
 def _fill_example_ids(self, eids, slide_cache):
     sampled_hids = []
     max_idx = -1
     for idx, eid in eids:
         if idx <= slide_cache.get_stats_index():
             continue
         max_idx = idx
         hkey = '{}{}'.format(self._hash_prefix, eid)
         hid = CityHash64(hkey)
         if hid % JoinerStats._SampleRateReciprocal == 0:
             sampled_hids.append(hid)
     if max_idx >= 0:
         return slide_cache.fill_hash_ids(max_idx, sampled_hids)
     return []
예제 #12
0
    def update_feed(self, url):
        """Refresh the stored items of one feed and notify its subscribers.

        ``url[0]`` is the feed URL. When the feed parses, every stored item
        is first marked inactive; items present in the feed are (re)marked
        active with the current time, and items not stored before are
        collected as new. Inactive items older than five days are removed
        and the result is written back to the database. Each active
        subscriber then receives the new posts. When parsing fails, an error
        message is sent instead.
        """
        feed_url = url[0]
        try:
            feed = FeedHandler.parse_feed(feed_url)
        except Exception:
            # Best effort: a broken feed must not stop the updater, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            feed = False
            traceback.print_exc()

        if feed:
            print(f'{feed_url}:')
            print(f'Longitud de feed: {len(feed)}')
            url_items = self.db.get_url_items(url=feed_url)
            for item in url_items:
                url_items[item]['active'] = False

            new_items = []
            for item in feed:
                item_hash = str(
                    CityHash64(item['summary'] + item['title'] + item['link']))
                if item_hash not in url_items:
                    new_items.append(item)
                url_items[item_hash] = {
                    'active': True,
                    'last_date': DateHandler.get_datetime_now()
                }

            # Iterate over a copy because stale entries are popped in the loop.
            for item, value in url_items.copy().items():
                if value['active']:
                    continue
                print(f'Desactivando {item}')
                if DateHandler.is_older_than_days(value['last_date'], 5):
                    print(f'Borrando {item}')
                    url_items.pop(item)

            self.db.update_url_items(url=feed_url, items=url_items)

        telegram_users = self.db.get_users_for_url(url=feed_url)

        for user in telegram_users:
            if user[6]:  # is_active
                if not feed:
                    message = "Something went wrong when I tried to parse the URL: \n\n " + \
                        feed_url + "\n\nCould you please check that for me? Remove the url from your subscriptions using the /remove command, it seems like it does not work anymore!"
                    self.bot.send_message(chat_id=user[0],
                                          text=message,
                                          parse_mode=ParseMode.HTML)
                    # NOTE(review): returning here means only the first
                    # active subscriber is told about the failure - confirm
                    # whether that is intended.
                    return

                for post in new_items:
                    self.send_message(post=post, user=user)
예제 #13
0
def hash64(to_hash: str) -> int:
    """
    Hash a string with CityHash64 and shift the result into signed territory.

    The hash is lowered by ``2**63 - 1`` so that it can be stored in a
    postgres BIGINT.

    NOTE(review): assuming CityHash64 spans the full unsigned 64-bit range,
    the largest possible result is ``2**63``, one above the BIGINT maximum -
    confirm whether that edge matters before changing the offset, since any
    change would shift every stored value.

    Parameters
    ----------
    to_hash : str
        String to hash

    Returns
    -------
    int
        Hashed input, offset towards the int64 range
    """
    signed_offset = (2**63) - 1
    unsigned_hash = CityHash64(to_hash)
    return unsigned_hash - signed_offset
예제 #14
0
def compute_hash64(text: Union[str, pd.Series]) -> Union[int, pd.Series]:
    """Compute consistent 64-bit signed integer for text or texts

    A single string is hashed with CityHash64 and reinterpreted as a signed
    64-bit integer. A Series is hashed element-wise; a ``uint64`` result is
    cast to ``int64``.

    Raises:
        ValueError: if the hashed Series does not end up with dtype int64.
        TypeError: if ``text`` is neither a ``str`` nor a ``pd.Series``.
    """
    # c_long is only 32 bits wide on LLP64 platforms (e.g. Windows), which
    # would truncate the hash; c_int64 is 64 bits everywhere.
    from ctypes import c_int64

    if isinstance(text, str):
        text_hash = c_int64(CityHash64(text)).value
    elif isinstance(text, pd.Series):
        text_hash = text.apply(CityHash64)
        if text_hash.dtype == np.uint64:
            # Reinterpret unsigned hashes as signed, matching the str branch.
            text_hash = text_hash.astype(np.int64)
        if text_hash.dtype != np.int64:
            raise ValueError(
                f'Computed hashes must be of type np.int64 instead of {text_hash.dtype}!'
            )
    else:
        raise TypeError(
            f"Input must of type str or Series, but got {type(text)}!")
    return text_hash
예제 #15
0
def hash_nanopub(nanopub: Mapping[str, Any]) -> str:
    """Build a duplicate-detection hash for a nanopub.

    TODO - check that this hash value is consistent between C# and Python running on
    laptop and server

    The hashed string is assembled from stripped values, in this order:

    1. ``nanopub.type.name`` and ``nanopub.type.version``
    2. the first citation form that is present: ``citation.database``
       (``name`` then ``id``), else ``citation.uri``, else
       ``citation.reference``
    3. the sorted assertions, each as "subject relation object"
    4. the sorted annotations, each as "type id"

    Every part is lowercased and the parts are joined with single spaces
    before hashing.

    Side effect: an assertion whose ``relation`` or ``object`` is missing or
    ``None`` gets that key set to ``""`` in the nanopub that was passed in.

    Returns:
        The CityHash64 of the assembled string, formatted as lowercase hex.
    """
    record = nanopub["nanopub"]

    # Type
    np_type = record["type"]
    parts = [
        np_type.get("name", "").strip(),
        np_type.get("version", "").strip(),
    ]

    # Citation: only the first form that is present contributes.
    citation = record["citation"]
    if citation.get("database", False):
        database = citation["database"]
        parts.append(database.get("name", "").strip())
        parts.append(database.get("id", "").strip())
    elif citation.get("uri", False):
        parts.append(citation.get("uri", "").strip())
    elif citation.get("reference", False):
        parts.append(citation.get("reference", "").strip())

    # Assertions
    assertion_strings = []
    for assertion in record["assertions"]:
        # Normalise absent/None fields in place so they can be stripped.
        for field in ("relation", "object"):
            if assertion.get(field) is None:
                assertion[field] = ""
        triple = (
            assertion["subject"].strip(),
            assertion.get("relation", "").strip(),
            assertion.get("object", "").strip(),
        )
        assertion_strings.append(" ".join(triple).strip())
    assertion_strings.sort()
    parts.extend(assertion_strings)

    # Annotations
    annotation_strings = sorted(
        " ".join((anno.get("type", "").strip(),
                  anno.get("id", "").strip())).strip()
        for anno in record["annotations"])
    parts.extend(annotation_strings)

    np_string = " ".join(part.lower() for part in parts)
    digest = CityHash64(np_string)

    return "{:x}".format(digest)
예제 #16
0
 def _oneway_hash_list(items):
     return [hex(CityHash64(str(item)))[2:] for item in items]
예제 #17
0
def generate_uuids() -> Iterable[Tuple[str, int]]:
    """Yield an endless stream of ``(uuid4 text, CityHash64 of that text)``."""
    while True:
        token = str(uuid.uuid4())
        yield (token, CityHash64(token))
예제 #18
0
def hash_nanopub(nanopub: Mapping[str, Any]) -> str:
    """Create a CityHash64 hex digest of a nanopub for duplicate checks.

    TODO - check that this hash value is consistent between C# and Python running on
    laptop and server

    A flat list of stripped strings is collected:

    * ``nanopub.type.name`` and ``nanopub.type.version``
    * one citation form, whichever is found first: ``citation.database``
      (``name``, ``id``), ``citation.uri`` or ``citation.reference``
    * the sorted assertions (subject, relation and object joined by spaces)
    * the sorted annotations (type and id joined by a space)

    The list entries are lowercased, joined with single spaces and hashed.

    Side effect: ``relation``/``object`` keys that are missing or ``None``
    on an assertion are set to ``''`` in the nanopub that was passed in.

    Returns:
        The hash formatted as lowercase hexadecimal text.
    """

    def clean(mapping, key):
        # Fetch a value (default '') and strip surrounding whitespace.
        return mapping.get(key, '').strip()

    content = nanopub['nanopub']

    # Type
    hash_list = [clean(content['type'], 'name'), clean(content['type'], 'version')]

    # Citation
    citation = content['citation']
    if citation.get('database', False):
        hash_list.append(clean(citation['database'], 'name'))
        hash_list.append(clean(citation['database'], 'id'))
    elif citation.get('uri', False):
        hash_list.append(clean(citation, 'uri'))
    elif citation.get('reference', False):
        hash_list.append(clean(citation, 'reference'))

    # Assertions
    assertions = []
    for assertion in content['assertions']:
        if assertion.get('relation') is None:
            assertion['relation'] = ''
        if assertion.get('object') is None:
            assertion['object'] = ''
        subject = assertion['subject'].strip()
        relation = clean(assertion, 'relation')
        obj = clean(assertion, 'object')
        assertions.append(' '.join((subject, relation, obj)).strip())
    hash_list.extend(sorted(assertions))

    # Annotations
    annotations = [
        ' '.join((clean(anno, 'type'), clean(anno, 'id'))).strip()
        for anno in content['annotations']
    ]
    hash_list.extend(sorted(annotations))

    np_string = ' '.join(entry.lower() for entry in hash_list)

    return '{:x}'.format(CityHash64(np_string))
예제 #19
0
 def partition(self, key, partitions=None):
     """Choose the partition for ``key``.

     The CityHash64 of ``key`` modulo the number of partitions selects the
     entry. ``self.partitions`` is used when ``partitions`` is falsy
     (``None`` or empty).
     """
     candidates = partitions if partitions else self.partitions
     hashed = CityHash64(key)
     return candidates[hashed % len(candidates)]
예제 #20
0
파일: funcs.py 프로젝트: binh-vu/shmr
def str2hashnumber(x: str):
    """Return the CityHash64 value of ``x``."""
    hashed = CityHash64(x)
    return hashed
예제 #21
0
    from cityhash import CityHash32, CityHash64, CityHash128
except ImportError:
    print("CityHash not installed, pip install cityhash")
else:
    # cityhash32
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash32(b"%s" % key)
    print("CityHash32 took {} s".format(time() - pt))

    # cityhash64
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash64(b"%s" % key)
    print("CityHash64 took {} s".format(time() - pt))

    # cityhash128
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash128(b"%s" % key)
    print("CityHash128 took {} s".format(time() - pt))

# murmurhash v3
try:
    import mmh3
except ImportError:
    print("MurmurHash v3 not installed, pip install murmurhash3")
else:
예제 #22
0
def make_cityhash():
    """Hash the concatenation of the first three command-line arguments."""
    first, second, third = sys.argv[1], sys.argv[2], sys.argv[3]
    return CityHash64(first + second + third)
예제 #23
0
 def hash_uri(uri_string):
     '''Return a 64 bit hash of the input (uri) string.

     Kept as its own function so the hashing scheme is easy to replace;
     right now it delegates to the 64 bit version of Google's CityHash.'''
     uri_hash = CityHash64(uri_string)
     return uri_hash
예제 #24
0
파일: urls.py 프로젝트: derlin/swisstext
 def get_hash(text) -> str:
     """Return the CityHash64 of ``text`` converted with ``str()``."""
     digest = CityHash64(text)
     return str(digest)
예제 #25
0
from util.database import DatabaseHandler
from util.feedhandler import FeedHandler
from util.datehandler import DateHandler
from cityhash import CityHash64

# Ad-hoc check of how feed items map onto CityHash64 keys for one feed.
db = DatabaseHandler("resources/datastore.db")

arg_url = 'http://yle.fi/uutiset/rss/paauutiset.rss'

# The result is iterated below as a sequence of feed items.
feed = FeedHandler.is_parsable(url=arg_url)

# Index the items by the hash of summary + title + link; when two items
# produce the same hash, print the newcomer next to the stored entry.
items = {}
for item in feed:
    hash = CityHash64(item['summary'] + item['title'] + item['link'])
    if (hash in items):
        print(item['link'], item['summary'], items[hash])
    items[hash] = {
        'active': True,
        'last_date': DateHandler.get_datetime_now(),
        'link': item['link']
    }
# Storing the items is disabled in this script.
#self.db.add_url(url=arg_url, items=items)

# Mark every item already stored for this URL as inactive.
url_items = db.get_url_items(url=arg_url)
for item in url_items:
    url_items[item]['active'] = False

new_items = []
for item in feed:
    hash = CityHash64(item['summary'] + item['title'] + item['link'])
    if not (str(hash) in url_items):
예제 #26
0
파일: urls.py 프로젝트: derlin/swisstext
 def get_hash(text) -> str:
     """Return the string form of the
     `CityHash64 <https://opensource.googleblog.com/2011/04/introducing-cityhash.html>`_
     hash of the given text."""
     hashed = CityHash64(text)
     return str(hashed)
예제 #27
0
def chs64(s):
    """Return the CityHash64 of ``s`` converted with ``str()``."""
    digest = CityHash64(s)
    return str(digest)
예제 #28
0
    from cityhash import CityHash32, CityHash64, CityHash128
except ImportError:
    print('CityHash not installed, pip install cityhash')
else:
    # cityhash32
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash32(b'%s' % key)
    print('CityHash32 took {} s'.format(time() - pt))

    # cityhash64
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash64(b'%s' % key)
    print('CityHash64 took {} s'.format(time() - pt))

    # cityhash128
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash128(b'%s' % key)
    print('CityHash128 took {} s'.format(time() - pt))

# murmurhash v3
try:
    import mmh3
except ImportError:
    print('MurmurHash v3 not installed, pip install murmurhash3')
else: