Esempio n. 1
0
def run(argv):
    """Rebuild the cpra->rsids and rsid->cpra tries from ``sites.tsv``.

    Nothing is rebuilt when ``should_replace`` reports that neither trie file
    needs replacing.  ``argv`` is accepted but not used.
    """
    if not (should_replace(cpra_to_rsids_trie_fname)
            or should_replace(rsid_to_cpra_trie_fname)):
        print('tries are up-to-date!')
        return

    sites_path = os.path.join(conf.data_dir, 'sites', 'sites.tsv')
    with open(sites_path, 'rt') as f:
        # Each parsed line is used below as a (cpra, rsids) pair.
        lines = [parse_line(line) for line in f]
    print('done loading.')

    forward_trie = marisa_trie.BytesTrie(lines, order=marisa_trie.LABEL_ORDER)
    forward_trie.save(cpra_to_rsids_trie_fname)
    print('done with cpra->rsids trie at ' + cpra_to_rsids_trie_fname)

    # TODO: What if several different chrom-pos-ref-alts have the same rsid?  Do we only get the first? Or the last?
    def rsid_cpra_pairs():
        """Yield one (rsid, cpra-bytes) pair per comma-separated rsid."""
        for cpra, rsids in lines:
            for rsid in rsids.split(b','):
                yield (rsid.decode(), cpra.encode())

    reverse_trie = marisa_trie.BytesTrie(
        rsid_cpra_pairs(), order=marisa_trie.LABEL_ORDER)
    reverse_trie.save(rsid_to_cpra_trie_fname)
    print('done with rsid->cpra trie at ' + rsid_to_cpra_trie_fname)
Esempio n. 2
0
def run(argv):
    """Build the cpra->rsids and rsid->cpra tries from the sites file.

    Prints a usage line and exits with status 1 when ``-h`` or ``--help`` is
    in ``argv``.  Does nothing but report when ``should_replace`` says both
    trie files are current.
    """
    if any(flag in argv for flag in ('-h', '--help')):
        print('Make tries for converting between chr-pos-ref-alt and rsid')
        exit(1)

    if not (should_replace(cpra_to_rsids_trie_filepath)
            or should_replace(rsid_to_cpra_trie_filepath)):
        print('tries are up-to-date!')
        return

    def cpra_of(variant):
        """Format a variant record as ``chrom-pos-ref-alt``."""
        return '{chrom}-{pos}-{ref}-{alt}'.format(**variant)

    # Note: the sites file is read twice, once per trie, so that each trie can be
    #       built from a stream instead of an in-memory list.  A different trie
    #       library might allow feeding both tries in one pass, but marisa_trie doesn't.
    with VariantFileReader(sites_filepath) as reader:
        forward_pairs = ((cpra_of(v), v['rsids'].encode('ascii')) for v in reader)
        # The trie must be built while the reader is still open.
        forward_trie = marisa_trie.BytesTrie(forward_pairs, order=marisa_trie.LABEL_ORDER)
    forward_trie.save(cpra_to_rsids_trie_filepath)
    print('done with cpra->rsids trie at ' + cpra_to_rsids_trie_filepath)

    def rsid_cpra_pairs(variants):
        """Yield (rsid, cpra-bytes) for every rsid of every variant that has any."""
        for v in variants:
            rsids = v['rsids']
            if not rsids:
                continue
            cpra = cpra_of(v).encode('ascii')
            for rsid in rsids.split(','):
                yield (rsid, cpra)

    # Note: if several different chrom-pos-ref-alts have the same rsid, then `trie[rsid]` = `[cpra1, cpra2, ...]`.
    with VariantFileReader(sites_filepath) as reader:
        reverse_trie = marisa_trie.BytesTrie(rsid_cpra_pairs(reader), order=marisa_trie.LABEL_ORDER)
    reverse_trie.save(rsid_to_cpra_trie_filepath)
    print('done with rsid->cpra trie at ' + rsid_to_cpra_trie_filepath)
Esempio n. 3
0
def run(argv):
    """Load every variant into memory, then build and save both lookup tries.

    The first trie maps ``chrom-pos-ref-alt`` to the ASCII-encoded ``rsids``
    field; the second maps each non-empty rsid back to its cpra.  Nothing is
    rebuilt when ``should_replace`` reports both files as current.  ``argv``
    is accepted but not used.
    """
    needs_rebuild = (should_replace(cpra_to_rsids_trie_filepath)
                     or should_replace(rsid_to_cpra_trie_filepath))
    if not needs_rebuild:
        print('tries are up-to-date!')
        return

    with VariantFileReader(sites_filepath) as reader:
        cpras_and_rsids = [
            ('{chrom}-{pos}-{ref}-{alt}'.format(**variant),
             variant['rsids'].encode('ascii'))
            for variant in reader
        ]
    print('done loading.')

    forward_trie = marisa_trie.BytesTrie(
        cpras_and_rsids, order=marisa_trie.LABEL_ORDER)
    forward_trie.save(cpra_to_rsids_trie_filepath)
    print('done with cpra->rsids trie at ' + cpra_to_rsids_trie_filepath)

    # TODO: What if several different chrom-pos-ref-alts have the same rsid?  Do we only get the first? Or the last?
    def rsid_cpra_pairs():
        """Yield (rsid, cpra-bytes) pairs, skipping empty rsid fields."""
        for cpra, rsids in cpras_and_rsids:
            for rsid in rsids.decode('ascii').split(','):
                if rsid:
                    yield (rsid, cpra.encode('ascii'))

    reverse_trie = marisa_trie.BytesTrie(
        rsid_cpra_pairs(), order=marisa_trie.LABEL_ORDER)
    reverse_trie.save(rsid_to_cpra_trie_filepath)
    print('done with rsid->cpra trie at ' + rsid_to_cpra_trie_filepath)
Esempio n. 4
0
def create_trie():
    """Build a BytesTrie from the ``Final_consolidated`` table and pickle it.

    For every row, the column at index 2 (lower-cased) becomes the key and
    the remaining columns, JSON-encoded as one list, become the value.  The
    resulting trie is pickled to ``consolidated.pkl`` in the current working
    directory.  Progress and timings are printed along the way.
    """
    keys = []
    values = []
    start = time.time()
    db = Database(None)
    # Single-argument print(...) produces the same text as the old print
    # statement on Python 2 and also parses on Python 3.
    print("Connected to DB in %s secs" % (time.time() - start))
    start = time.time()
    db.execute("SELECT * FROM `Final_consolidated`")
    print("Retrieved Info from table in %s secs" % (time.time() - start))
    for row in db.cursor:
        value = []
        for i, ele in enumerate(row):
            if i == 2:
                keys.append(ele.lower())
            else:
                value.append(ele)
        values.append(json.dumps(value))
    # The empty marisa_trie.Trie() that used to be created here was thrown
    # away immediately, so it has been removed.
    print("Starting to zip into Marisa trie")
    trie = marisa_trie.BytesTrie(zip(keys, values))
    print("Completed zipping into Marisa trie")
    print("Pickling Trie")
    with open('consolidated.pkl', 'wb') as output:
        pickle.dump(trie, output, pickle.HIGHEST_PROTOCOL)
    print("Total number of Keys: %s" % len(keys))
Esempio n. 5
0
def test_iteritems(keys, values):
    """``iteritems`` must yield exactly what ``items`` returns, with and without a prefix."""
    bytes_trie = marisa_trie.BytesTrie(zip(keys, values))
    assert bytes_trie.items() == list(bytes_trie.iteritems())

    # Use the first five characters of every key as a lookup prefix.
    for prefix in (key[:5] for key in keys):
        assert bytes_trie.items(prefix) == list(bytes_trie.iteritems(prefix))
Esempio n. 6
0
    def create_from_other(self, compdawg):
        """Populate ``self.trie`` and ``self.dawg`` from ``compdawg``.

        Only the keys whose ``count_prefixes`` entry holds exactly one value
        are kept; that value is popped from its container and stored
        UTF-8 encoded in both structures.
        """
        counted = count_prefixes(compdawg)
        unique = {}
        for key, candidates in counted.iteritems():
            if len(candidates) == 1:
                unique[key] = candidates.pop()

        def encoded_pairs():
            """Yield (key, utf-8 bytes) for every uniquely-mapped key."""
            for key, value in unique.iteritems():
                yield (key, value.encode("utf-8"))

        self.trie = marisa_trie.BytesTrie(encoded_pairs())
        self.dawg = dawg.BytesDAWG(encoded_pairs())
Esempio n. 7
0
    def _pack_multiple_nodes(keys, nodes):
        """Return a BytesTrie pairing each key with its node's packed string.

        Each node is serialised with ``pack_to_string()`` and converted to
        UTF-8 bytes; keys and nodes are paired positionally.
        """
        packed = [
            bytes(node.pack_to_string(), encoding='utf8') for node in nodes
        ]
        return mt.BytesTrie(zip(keys, packed))
Esempio n. 8
0
def test_dumps_loads(data):
    """A BytesTrie pickled to a buffer and read back compares equal to the original."""
    trie = marisa_trie.BytesTrie(data)

    stream = io.BytesIO()
    pickle.dump(trie, stream)
    stream.seek(0)  # rewind so the load starts at the first byte
    restored = pickle.load(stream)

    assert trie == restored
Esempio n. 9
0
 def test_getitem_multiple(self):
     """A key inserted with several values returns all of them from ``trie[key]``."""
     trie = marisa_trie.BytesTrie([
         ('foo', b'x'),
         ('fo', b'y'),
         ('foo', b'a'),
     ])
     expectations = (
         ('fo', [b'y']),
         ('foo', [b'a', b'x']),
     )
     for key, expected in expectations:
         assert trie[key] == expected
Esempio n. 10
0
    def test_getitem_fuzzy(self):
        """Each inserted key yields its single value; an absent key raises KeyError."""
        pairs = self.data()
        bytes_trie = marisa_trie.BytesTrie(pairs)

        for key, value in pairs:
            assert bytes_trie[key] == [value]

        # '2135' is used as a key that is not in the fixture data.
        with pytest.raises(KeyError):
            bytes_trie['2135']
Esempio n. 11
0
    def test_contains(self):
        """``in`` is true for every inserted key and false for an absent one."""
        pairs = self.data()
        bytes_trie = marisa_trie.BytesTrie(pairs)

        for key, _value in pairs:
            assert key in bytes_trie

        assert '2135' not in bytes_trie
Esempio n. 12
0
def test_getitem_multiple():
    """Looking up a key that was inserted twice returns both of its values."""
    pairs = [("foo", b"x"), ("fo", b"y"), ("foo", b"a")]
    trie = marisa_trie.BytesTrie(pairs)

    single = trie["fo"]
    assert single == [b"y"]

    multiple = trie["foo"]
    assert multiple == [b"a", b"x"]
Esempio n. 13
0
def load_trie(filename):
    """
    Read a file in the marisa_trie on-disk format into a new BytesTrie
    and return it.
    """
    loaded = marisa_trie.BytesTrie()
    # Loading emits warnings that are not meaningful here, so they are
    # silenced for the duration of the call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        loaded.load(filename)
    return loaded
Esempio n. 14
0
def test_contains(keys, values, missing_key):
    """Every inserted key is a member of the trie and ``missing_key`` is not.

    Inputs where ``missing_key`` happens to be one of ``keys`` are rejected
    via ``assume``.
    """
    assume(missing_key not in keys)

    # Materialise the pairs.  On Python 3 ``zip`` returns a one-shot iterator:
    # building the trie would exhaust it and the loop below would silently
    # check nothing.
    data = list(zip(keys, values))
    trie = marisa_trie.BytesTrie(data)

    for word, _value in data:
        assert word in trie

    assert missing_key not in trie
Esempio n. 15
0
    def __init__(self, phenos):
        """Copy ``phenos``, load the lookup tries and register the autocompleters.

        ``phenos`` is deep-copied before preprocessing so the caller's object
        is not modified.  The phenostring autocompleter is only registered
        when at least one pheno has a ``'phenostring'`` entry.
        """
        self._phenos = copy.deepcopy(phenos)
        self._preprocess_phenos()

        def load_trie(path_key):
            """Load the BytesTrie stored at ``common_filepaths[path_key]``."""
            return marisa_trie.BytesTrie().load(common_filepaths[path_key])

        self._cpra_to_rsids_trie = load_trie('cpra-to-rsids-trie')
        self._rsid_to_cpra_trie = load_trie('rsid-to-cpra-trie')
        self._gene_alias_trie = load_trie('gene-aliases-trie')

        self._autocompleters = [
            self._autocomplete_variant,
            self._autocomplete_rsid,
            self._autocomplete_phenocode,
            self._autocomplete_gene,
        ]
        has_phenostring = any(
            'phenostring' in pheno for pheno in self._phenos.values())
        if has_phenostring:
            self._autocompleters.append(self._autocomplete_phenostring)
Esempio n. 16
0
    def test_iteritems(self):
        """``iteritems`` and ``items`` agree for 1000 random pairs, with and without a prefix."""
        words = get_random_words(1000)
        blobs = get_random_binary(1000)
        bytes_trie = marisa_trie.BytesTrie(zip(words, blobs))

        assert bytes_trie.items() == list(bytes_trie.iteritems())

        # Use the first five characters of every word as a lookup prefix.
        for prefix in (word[:5] for word in words):
            assert bytes_trie.items(prefix) == list(bytes_trie.iteritems(prefix))
Esempio n. 17
0
def build_trie(file):
    """Build a BytesTrie from a UTF-8, tab-separated file.

    On each line the second field (stripped) is the key and the first field
    (stripped, UTF-8 encoded) is the value.
    """
    pairs = []
    with codecs.open(file, 'r', 'utf-8') as f:
        for line in f.readlines():
            fields = line.split(u'\t')
            # A BytesTrie stores bytes values, so the value is encoded here.
            pairs.append((fields[1].strip(),
                          fields[0].strip().encode('utf-8')))

    return marisa_trie.BytesTrie(pairs)
Esempio n. 18
0
def test_getitem(keys, values, missing_key):
    """``trie[key]`` returns the stored value; ``missing_key`` raises KeyError.

    Inputs where ``missing_key`` happens to be one of ``keys`` are rejected
    via ``assume``.
    """
    assume(missing_key not in keys)

    # Materialise the pairs.  On Python 3 ``zip`` returns a one-shot iterator:
    # building the trie would exhaust it and the loop below would silently
    # check nothing.
    data = list(zip(keys, values))
    trie = marisa_trie.BytesTrie(data)

    # NOTE(review): the single-value expectation assumes ``keys`` contains no
    # duplicates - confirm against the strategy that generates it.
    for key, value in data:
        assert trie[key] == [value]

    with pytest.raises(KeyError):
        trie[missing_key]
Esempio n. 19
0
def test_has_keys_with_prefix():
    """``has_keys_with_prefix`` is true for the empty, partial and full prefixes of stored keys."""
    fruit_trie = marisa_trie.BytesTrie([
        ('apple', b'foo'),
        ('pear', b'bar'),
        ('peach', b'baz'),
    ])

    for present in ('', 'a', 'pe', 'pear'):
        assert fruit_trie.has_keys_with_prefix(present)

    assert not fruit_trie.has_keys_with_prefix('x')
Esempio n. 20
0
    def test_null_bytes_in_values(self):
        """Values containing NUL bytes are stored and returned unchanged."""
        keys = get_random_words(10000)
        values = get_random_binary(10000)

        # Guard: the random sample must actually contain a NUL byte,
        # otherwise this test would not exercise what it claims to.
        assert any(b'\x00' in p for p in values)

        # Materialise the pairs.  On Python 3 ``zip`` returns a one-shot
        # iterator: building the trie would exhaust it and the loop below
        # would silently check nothing.
        data = list(zip(keys, values))
        trie = marisa_trie.BytesTrie(data)

        # NOTE(review): the single-value expectation assumes
        # get_random_words returns distinct words - confirm.
        for key, value in data:
            assert trie[key] == [value]
Esempio n. 21
0
    def test_pickling(self):
        """A BytesTrie pickled to a buffer and read back still returns its stored values."""
        pairs = [
            ('foo', b'foo'),
            ('bar', b'bar'),
        ]
        original = marisa_trie.BytesTrie(pairs)

        stream = io.BytesIO()
        pickle.dump(original, stream)
        stream.seek(0)  # rewind so the load starts at the first byte
        restored = pickle.load(stream)

        for key, value in pairs:
            assert restored[key] == [value]
Esempio n. 22
0
 def __init__(self, seq):
     """Build one BytesTrie per field of ``seq``, each paired with ``seq.dl_vlan``.

     Every trie is stored on ``self`` under the same name as the field it
     was built from.
     """
     field_names = (
         'nw_dst',
         'nw_src',
         'metadata',
         'in_port',
         'dl_dst',
         'dl_src',
     )
     for field in field_names:
         pairs = zip(getattr(seq, field), seq.dl_vlan)
         setattr(self, field, marisa_trie.BytesTrie(pairs))
Esempio n. 23
0
def analyze(input, longest=True):
    """Look up named entities in ``input`` using the compiled dictionary trie.

    ``input`` is handed to ``gen_prefix_array``; for each search string it
    returns, the dictionary keys that are prefixes of that string are looked
    up and their values recorded as tags.

    Args:
        input: text to analyse (the name shadows the builtin but is kept so
            existing callers are unaffected).
        longest: when true (the default, and the previous hard-coded
            behaviour) only the longest matching prefix of each search
            string is used, and it is skipped if that entity was already
            recorded.  When false every matching prefix is recorded.

    Returns:
        dict with ``'ne'`` (entity strings, one per tag), ``'neTags'`` (the
        UTF-8 decoded dictionary values), ``'neNum'`` (number of entries)
        and ``'morphResult'`` (as returned by ``gen_prefix_array``).
    """
    # If the dictionary is currently being compiled, wait 5 seconds first.
    if is_compiling():
        time.sleep(5)

    compile_dic = marisa_trie.BytesTrie().load(COMPILED_NE_DIC_PATH)
    # Only the search strings and the morph result are needed here.
    _morphs, search_iter, _search_index, morph_result = gen_prefix_array(input)

    ne_list = []
    ne_tag_list = []
    for candi in search_iter:
        findlist = compile_dic.prefixes(candi)
        LOGGER.info('candi={}, findlist={}'.format(candi, findlist))
        if not findlist:
            continue

        if longest:
            # Longest match: keep only the longest prefix, and only once.
            best = max(findlist, key=len)
            matches = [] if best in ne_list else [best]
        else:
            # Otherwise record every matching prefix.
            matches = findlist

        for item in matches:
            if item in compile_dic:
                for value in compile_dic[item]:
                    ne_list.append(item)
                    ne_tag_list.append(value.decode('utf-8'))

    # Named-entity analysis result structure.
    ne_result = {
        'ne': ne_list,
        'neTags': ne_tag_list,
        'neNum': len(ne_list),
        'morphResult': morph_result,
    }
    return ne_result
Esempio n. 24
0
 def test_items(self):
     """``items`` returns every stored pair matching the prefix, duplicates included."""
     data = [
         ('fo', b'y'),
         ('foo', b'x'),
         ('foo', b'a'),
     ]
     trie = marisa_trie.BytesTrie(data)
     everything = set(data)

     assert set(trie.items()) == everything
     for prefix in ('f', 'fo'):
         assert set(trie.items(prefix)) == everything

     # Only the two 'foo' pairs match the full key.
     assert set(trie.items('foo')) == set(data[1:])

     for absent in ('food', 'bar'):
         assert trie.items(absent) == []
Esempio n. 25
0
def test_items():
    """Prefix queries through ``items`` return the matching pairs as a set, or ``[]``."""
    data = [("fo", b"y"), ("foo", b"x"), ("foo", b"a")]
    trie = marisa_trie.BytesTrie(data)

    all_pairs = set(data)
    foo_pairs = set(data[1:])

    assert set(trie.items()) == all_pairs
    assert set(trie.items("f")) == all_pairs
    assert set(trie.items("fo")) == all_pairs
    assert set(trie.items("foo")) == foo_pairs

    # Prefixes that match no stored key give an empty list.
    for unmatched in ("food", "bar"):
        assert trie.items(unmatched) == []
Esempio n. 26
0
 def _read_ipa2xs(self):
     """Read the packaged ``data/<ipa2xs_fn>`` table into a BytesTrie.

     The first row is skipped.  Every following row must have three columns:
     the first is the key, the second is stored UTF-8 encoded as the value
     and the third is ignored.
     """
     resource = pkg_resources.resource_filename(
         __name__, os.path.join('data', self.ipa2xs_fn))
     with open(resource, 'rb') as f:
         # NOTE(review): csv.reader is called with encoding=, so ``csv`` is
         # presumably an encoding-aware reader rather than the stdlib module.
         rows = csv.reader(f, encoding='utf-8')
         next(rows)
         pairs = [(ipa, xs.encode('utf-8')) for ipa, xs, _ in rows]
     return marisa_trie.BytesTrie(pairs)
Esempio n. 27
0
    def test_keys(self):
        """``keys`` lists a key once per stored value and honours the prefix argument."""
        trie = marisa_trie.BytesTrie([
            ('foo', b'x'),
            ('fo', b'y'),
            ('foo', b'a'),
        ])

        # FIXME: ordering?
        all_keys = ['foo', 'foo', 'fo']
        assert trie.keys() == all_keys
        for prefix in ('f', 'fo'):
            assert trie.keys(prefix) == all_keys

        assert trie.keys('foo') == ['foo', 'foo']

        for absent in ('food', 'bar'):
            assert trie.keys(absent) == []
Esempio n. 28
0
def test_keys():
    """Prefix queries through ``keys`` repeat a key for each value stored under it."""
    pairs = [("foo", b"x"), ("fo", b"y"), ("foo", b"a")]
    trie = marisa_trie.BytesTrie(pairs)

    # FIXME: ordering?
    every_key = ["foo", "foo", "fo"]
    only_foo = ["foo", "foo"]

    assert trie.keys() == every_key
    assert trie.keys("f") == every_key
    assert trie.keys("fo") == every_key
    assert trie.keys("foo") == only_foo

    # Prefixes that match no stored key give an empty list.
    for unmatched in ("food", "bar"):
        assert trie.keys(unmatched) == []
Esempio n. 29
0
def Main(argv):
    """Load a marisa_trie dictionary and run ``PrintTrie`` on every word from stdin.

    The dictionary path is the single positional command-line argument.
    Arguments are parsed from the process command line; ``argv`` itself is
    not read.  Reading stops, and the process exits, at the first word that
    is exactly ``"."``.
    """
    # Argument parser
    arg_parser = argparse.ArgumentParser(usage=PrintHelp())
    arg_parser.add_argument(
        "marisa", help="Cesta ku slovníku vo formáte marisa_trie.")
    options = arg_parser.parse_args()

    # Load the trie
    dictionary = marisa_trie.BytesTrie()
    dictionary.load(options.marisa)
    print("Morphological analyser")

    # Input
    for line in sys.stdin:
        for word in line.split():
            if word == ".":
                sys.exit()
            PrintTrie(dictionary, word)
Esempio n. 30
0
def run(argv):
    """Make sure the gene-aliases trie exists, building it if necessary.

    The genes file is downloaded first when it is missing.  If the aliases
    trie is already on disk only its location is reported.  ``argv`` is
    accepted but not used.
    """
    genes_filepath = common_filepaths['genes']
    if not os.path.exists(genes_filepath):
        print('Downloading genes')
        from . import download_genes
        download_genes.run([])

    aliases_filepath = common_filepaths['gene-aliases-trie']
    if os.path.exists(aliases_filepath):
        print('gene aliases are at {!r}'.format(aliases_filepath))
        return

    print('gene aliases will be stored at {!r}'.format(aliases_filepath))
    # Trie values must be bytes, so each mapped value is ASCII-encoded.
    alias_pairs = [
        (alias, canonical.encode('ascii'))
        for alias, canonical in get_gene_aliases().items()
    ]
    aliases_trie = marisa_trie.BytesTrie(alias_pairs)
    aliases_trie.save(aliases_filepath)