Example #1
from collections import OrderedDict

from acora import AcoraBuilder


def get_key_word(data):
    output_database = []
    if len(data["entity_dict"]) >= 1:
        dicts = OrderedDict()
        for key in data["entity_dict"]:
            dicts[key] = key
            for t in data["entity_dict"][key]:
                dicts[t] = key
        query = data["query"]
        key_word_builder = AcoraBuilder(list(dicts))
        key_word_searcher = key_word_builder.build()
        print(dicts, "------detected dicts-------")
        res = key_word_searcher.findall(query)
        print(res)
        if len(res) >= 1:
            input_entity = [item[0] for item in res]
            input_entity_key = []
            for char in input_entity:
                input_entity_key.extend(data["entity_dict"][dicts[char]])
                input_entity_key.append(dicts[char])
            input_key_entity = list(set(input_entity_key))
            key_word_builder = AcoraBuilder(input_key_entity)
            key_word_searcher = key_word_builder.build()
            for doc in data["database"]:  # don't rebind `data` inside its own loop
                t = len(key_word_searcher.findall(doc))
                output_database.append(t)
        else:
            for _ in data["database"]:
                output_database.append(0)
    else:
        for _ in data["database"]:
            output_database.append(0)
    return output_database
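
A minimal usage sketch for `get_key_word`; the shape of `data` (a canonical entity mapped to its aliases, plus the query and the documents to score) is inferred from the function body, and the values are illustrative:

data = {
    "entity_dict": {"Paris": ["paris", "City of Light"]},
    "query": "hotels near Paris",
    "database": ["Paris travel guide", "Berlin nightlife"],
}
print(get_key_word(data))  # [1, 0]: one keyword hit in the first document, none in the second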
Example #2
import re
import timeit
from functools import partial
from time import time

from acora import AcoraBuilder, PyAcora

# COMPARED_IMPLEMENTATIONS and REPEAT_COUNT are module-level settings of the
# benchmark script this function comes from.


def compare_search(s, filename, ignore_case, *keywords):
    setup_pya = setup_ca = setup_re = 0
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t
    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t
    if run_re:
        t = time()
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t
    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" %
          (ignore_case and 'in' or '', builder.for_unicode and 'unicode'
           or 'bytes', setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(partial(py_acora.findall,
                                       s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(partial(c_acora.findall,
                                       s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(
                py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(
                c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(partial(regexp.findall,
                                       s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    return (run_pa and py_acora.findall(s)
            or None, run_ca and c_acora.findall(s)
            or None, run_pa and (filename and py_acora.filefindall(filename))
            or None, run_ca and (filename and c_acora.filefindall(filename))
            or None, run_re and regexp.findall(s) or None)
Example #3
import time

from Bio import SeqIO
from acora import AcoraBuilder


def setup(vregions_file, jregions_file):

    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    handle = open(vregions_file, 'r')
    v_list = list(SeqIO.parse(handle, 'fasta'))
    handle.close()
    v_genes = [str(v.seq).upper() for v in v_list]
    v_genes_cut = [v[-v_end_length:] for v in v_genes]

    all_v_substrings = []
    for v in v_genes_cut:
        all_v_substrings.append([
            v[i:i + n]
            for n in range(4, len(v) + 1)
            for i in range(len(v) - (n - 1))
        ])

    t0 = time.time()
    v_keyword_tries = []
    for v_substrings in all_v_substrings:
        v_builder = AcoraBuilder()
        for substring in v_substrings:
            v_builder.add(substring)
        v_keyword_tries.append(v_builder.build())
    print 'V keyword tries built in', round(time.time() - t0, 2), 'seconds'

    handle = open(jregions_file, 'r')
    j_list = list(SeqIO.parse(handle, 'fasta'))
    handle.close()
    j_genes = [str(j.seq).upper() for j in j_list]
    j_genes_cut = [j[:j_start_length] for j in j_genes]

    all_j_substrings = []
    for j in j_genes_cut:
        all_j_substrings.append([
            j[i:i + n]
            for n in range(4, len(j) + 1)
            for i in range(len(j) - (n - 1))
        ])

    t0 = time.time()
    j_keyword_tries = []
    for j_substrings in all_j_substrings:
        j_builder = AcoraBuilder()
        for substring in j_substrings:
            j_builder.add(substring)
        j_keyword_tries.append(j_builder.build())
    print 'J keyword tries built in', round(time.time() - t0, 2), 'seconds'

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
Example #4
    def __init__(self, term_index):
        self.term_index = term_index

        builder = AcoraBuilder()
        for text in term_index:
            builder.add(text)
        self.ac = builder.build()
Example #5
    def match_lines(self, s, *keywords):
        '''
        Search the given text for the specified keywords.

        @param s: the text to search.
        @param keywords: the keywords to look for.

        @returns: the lines in which at least one keyword occurs.
        '''

        builder = AcoraBuilder('\r', '\n', *keywords)
        ac = builder.build()

        line_start = 0
        matches = False
        for kw, pos in ac.finditer(s):
            if kw in '\r\n':
                if matches:
                    yield s[line_start:pos]
                    matches = False
                line_start = pos + 1
            else:
                matches = True
        if matches:
            yield s[line_start:]
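
A short usage sketch; `LineSearcher` here is a hypothetical host class for the method above:

searcher = LineSearcher()  # hypothetical class containing match_lines
for line in searcher.match_lines("key=1\nother\nparam=2\n", "key", "param"):
    print(line)  # prints "key=1", then "param=2"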
Example #6
from acora import AcoraBuilder


def build_keyword_tries(seqs):

    builder = AcoraBuilder()
    for seq in seqs:
        builder.add(str(seq))  # add every sequence tag to the keyword trie

    return builder.build()
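
An illustrative call on a couple of short tag sequences:

trie = build_keyword_tries(["ACGT", "GTTC"])
print(trie.findall("AAACGTTC"))  # [('ACGT', 2), ('GTTC', 4)]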
Example #7
    def __init__(self):
        # set of all entity words
        self._ner_word_list = []

        # name used to replace matched entity words
        self._ner_name = ""

        # builder for the Aho-Corasick (AC) automaton
        self._builder = AcoraBuilder()
Example #8
    def __init__(self, keywords, vocab=None):
        from acora import AcoraBuilder
        builder = AcoraBuilder()
        #assert isinstance(keywords, (list,tuple))
        self.vocab = vocab
        for i in keywords:
            builder.add(i)

        #Generate the Acora search engine for the current keyword set:
        self.engine = builder.build()
Example #9
    def _build(self):
        builder = AcoraBuilder()

        for idx, item in enumerate(self._regexes_or_assoc):

            #
            #   First we compile all regular expressions and save them to
            #   the re_cache.
            #
            if isinstance(item, tuple):
                regex = item[0]
                regex = regex.encode(DEFAULT_ENCODING)
                self._re_cache[regex] = re.compile(regex,
                                                   self._re_compile_flags)

                if regex in self._translator:
                    raise ValueError('Duplicated regex "%s"' % regex)

                self._translator[regex] = item[1:]
            elif isinstance(item, basestring):
                regex = item.encode(DEFAULT_ENCODING)
                self._re_cache[regex] = re.compile(regex,
                                                   self._re_compile_flags)
            else:
                raise ValueError('Can NOT build MultiRE with provided values.')

            #
            #   Now we extract the string literals (longer than hint_len only) from
            #   the regular expressions and populate the acora index
            #
            regex_hints = esmre.hints(regex)
            regex_keywords = esmre.shortlist(regex_hints)

            if not regex_keywords:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Get the longest one
            regex_keyword = regex_keywords[0]

            if len(regex_keyword) <= self._hint_len:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Add this keyword to the acora index, and also save a way to associate the
            # keyword with the regular expression
            regex_keyword = regex_keyword.lower()
            builder.add(regex_keyword)

            regexes_matching_keyword = self._keyword_to_re.get(
                regex_keyword, [])
            regexes_matching_keyword.append(regex)
            self._keyword_to_re[regex_keyword] = regexes_matching_keyword

        return builder.build()
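
The idea behind `_build` — scan with a cheap Aho-Corasick keyword pass first, and only run the expensive regular expressions whose keyword actually occurred — can be sketched in isolation. This is a standalone illustration with made-up keywords and patterns, not the class's API:

import re

from acora import AcoraBuilder

# Each regex is keyed by a literal substring it must contain, so the
# Aho-Corasick pass acts as a cheap pre-filter.
keyword_to_re = {
    'error': re.compile(r'sql error: \d+'),
    'denied': re.compile(r"access denied for user '\w+'"),
}
ac = AcoraBuilder(list(keyword_to_re)).build()

text = "log: sql error: 1045, access denied for user 'root'"
for keyword, _pos in ac.finditer(text):
    match = keyword_to_re[keyword].search(text)
    if match:
        print(match.group(0))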
Example #10
    def __init__(self, use_unicode=True, ignore_case=False, titles=None):
        """
        :param use_unicode: whether to use `titles` as unicode or bytestrings
        :param ignore_case: if True ignore case in all matches
        :param titles: if given, overrides default `load_titles()` values
        """
        titles = titles if titles else load_titles()
        titles = (titles if use_unicode else
                  (s.encode('ascii') for s in titles))
        builder = AcoraBuilder()
        builder.update(titles)
        self.ac = builder.build(ignore_case=ignore_case)
Example #11
    def __init__(self, content: List[str], ignore_case: bool):
        """
        Acora matcher factory
        :param content: a list of items to search
        :param ignore_case: True to match any case
        :return: a built matcher
        """
        # seed with a placeholder string in case content is empty;
        # an empty builder would otherwise produce a binary Acora matcher
        builder = AcoraBuilder("!@#$%%^&*")
        if len(content) > 0:
            builder.update(content)
        self.matcher = builder.build(ignore_case=ignore_case)
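
A hedged usage sketch for the factory above; the class name `Matcher` is assumed, since the snippet only shows `__init__`:

m = Matcher(["acora", "engine"], ignore_case=True)  # hypothetical class name
for keyword, pos in m.matcher.finditer("The Acora engine"):
    print(keyword, pos)  # each hit is a (keyword, start position) pair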
Example #12
    def test_acora_python(self):
        builder = AcoraBuilder()
        builder.update([s for (s,) in SQL_ERRORS])
        ac = builder.build(acora=PyAcora)

        i = 0

        #
        # This takes around 9 seconds on my workstation.
        #
        for j in xrange(self.ITERATIONS):
            for _ in ac.finditer(HTTP_RESPONSE):
                i += 1

        self.assertEqual(i, self.ITERATIONS * 2)
Example #13
    def __init__(self, keywords: Optional[Iterable[str]] = None):
        non_empty_keywords = []
        if keywords is not None:
            for w in keywords:
                if w.strip() != "":
                    non_empty_keywords.append(w)

        self._keywords = set(non_empty_keywords)

        if len(self._keywords) > 0:
            ac_builder = AcoraBuilder()
            ac_builder.update(self._keywords)
            self._finder = ac_builder.build()
        else:
            self._finder = None
Example #14
File: graph.py  Project: cjx3721/QA
	def directed_graph(self) :
		if not hasattr(self, "_directed_graph") :
			print "getting directed graph ..."
			
			graph = defaultdict(_dd_int)
			# Zhu: in my VM, the build rate is about 14,000 entities per second
			ac = AcoraBuilder(*self.database.entities).build()
			
			# matching takes negligible time compared with the build
			for text, attrib in self.database :
				entities = zip(*longest_match(ac.finditer(text)))[0]
				for entity in set(entities) :
					if entity == attrib["title"] :
						continue
					graph[attrib["title"]][entity] += 1
			
			delattr(self, "database")
			self._directed_graph = graph
			
		return self._directed_graph
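
`longest_match` is not defined in the snippet above; a minimal sketch of one plausible implementation, which keeps only the longest keyword among matches sharing a start position:

def longest_match(matches):
    # Hypothetical stand-in for the helper imported elsewhere: finditer can
    # report overlapping keywords, so among matches that share a start
    # position we keep only the longest one.
    best = {}
    for kw, pos in matches:
        if pos not in best or len(kw) > len(best[pos]):
            best[pos] = kw
    return [(kw, pos) for pos, kw in sorted(best.items())]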
Example #15
    def _build(self):
        builder = AcoraBuilder()

        for idx, item in enumerate(self._keywords_or_assoc):

            if isinstance(item, tuple):
                keyword = item[0]
                keyword = keyword.encode(DEFAULT_ENCODING)

                if keyword in self._translator:
                    raise ValueError('Duplicated keyword "%s"' % keyword)

                self._translator[keyword] = item[1:]

                builder.add(keyword)
            elif isinstance(item, basestring):
                keyword = item.encode(DEFAULT_ENCODING)
                builder.add(keyword)
            else:
                raise ValueError('Can NOT build MultiIn with provided values.')

        return builder.build()
Example #16
def import_tcr_info(inputargs):
    """ import_tcr_info: Gathers the required TCR chain information for Decombining """

    # Get chain information
    global chain

    chain = get_chain(inputargs)

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print 'Importing TCR', ", ".join(map(chainnams.__getitem__,
                                         chain)), 'gene sequences...'

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print "Please note that there is currently no extended tag set for mouse TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
    In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."

        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and ('g' in chain or 'd' in chain):

        print "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\' for these chains.\n \
    In future, consider editing the script to change the default, or use the appropriate flags."

        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.

    global v_half_split, j_half_split
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
    Please check tag set and species flag."

        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
    If mouse is required by default, consider changing the default value in the script."

        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"

    chain_order = []

    for gene in ['v', 'j']:

        # Get FASTA data
        fasta_holder = []

        for i in range(len(chain)):
            fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                       chain[i], gene, "fasta",
                                       inputargs['tagfastadir'])
            fasta_holder.append(list(SeqIO.parse(fasta_file, "fasta")))
            fasta_file.close()
        globals()[gene + "_genes"] = flatten(fasta_holder)

        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append(
                string.upper(globals()[gene + "_genes"][g].seq))

        # Get tag data

        gene_seq_holder = []  #initialise arrays
        half1_gene_seq_holder = []
        half2_gene_seq_holder = []
        jumpfunction_holder = []

        for i in range(len(chain)):
            tag_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                     chain[i], gene, "tags",
                                     inputargs['tagfastadir'])  # get tag data
            if gene == 'v': jumpfunction = "jump_to_end_v"
            elif gene == 'j': jumpfunction = "jump_to_start_j"
            tag_info_holder = globals()["get_" + gene + "_tags"](
                tag_file, globals()[gene + "_half_split"])
            gene_seq_holder.append(tag_info_holder[0])
            half1_gene_seq_holder.append(tag_info_holder[1])
            half2_gene_seq_holder.append(tag_info_holder[2])
            jumpfunction_holder.append(tag_info_holder[3])
            chain_order.append([chain[i], gene, len(gene_seq_holder[i])])
            tag_file.close()

        globals()[gene + "_seqs"] = flatten(gene_seq_holder)
        globals()["half1_" + gene + "_seqs"] = flatten(half1_gene_seq_holder)
        globals()["half2_" + gene + "_seqs"] = flatten(half2_gene_seq_holder)
        globals()[jumpfunction] = flatten(jumpfunction_holder)

        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str(
                globals()[gene +
                          "_seqs"][i]))  # add all tags to the keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()

        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add(
                str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene +
                  "_key"] = globals()[gene + "_half1_builder"].build()

        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add(
                str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene +
                  "_key"] = globals()[gene + "_half2_builder"].build()

    return chain_order
Example #17
import numpy as np
import pandas as pd

zy = {'00': 1,
      '01': 1,
      '02': 1,
      '03': 1,
      '10': 1,
      '11': 1,
      '20': 1,
      '22': 1,
      '30': 1,
      '33': 1}

zy = {i: np.log(zy[i]) for i in zy.keys()}

from acora import AcoraBuilder
views = pd.read_csv('View.csv', delimiter='\t', encoding='utf-8')['View']
views = AcoraBuilder(*views)
views = views.build()


def predict(i, data):
    y_pred = data.loc[i, 'predict']
    s = data.loc[i, 'Content'][:maxlen]
    nodes = [dict(zip(['0', '1', '2', '3'], k))
             for k in np.log(y_pred[:len(s)])]
    tags_pred_1 = viterbi(nodes)
    for j in views.finditer(s):
        for k in range(j[1], j[1] + len(j[0])):
            nodes[k]['1'] += 100
            nodes[k]['2'] += 100
            nodes[k]['3'] += 100
        try:
Example #18
v_regions = []
v_nams = []
for v in range(0, len(v_genes)):
  v_regions.append(str(v_genes[v].seq).upper())
  v_nams.append(v_genes[v].id.split("|")[1])

j_regions = []
j_nams = []
for j in range(0, len(j_genes)):
  j_regions.append(str(j_genes[j].seq).upper())
  j_nams.append(j_genes[j].id.split("|")[1])

## Build keyword tries of V and J tags for fast assignment
v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_tr"+ chain.lower() + "v.txt", "rU"), v_half_split)
j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_tr"+ chain.lower() + "j.txt", "rU"), j_half_split)

v_builder = AcoraBuilder()
for i in range(0,len(v_seqs)):
    v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

v_key = v_builder.build()

j_builder = AcoraBuilder()
for i in range(0,len(j_seqs)):
    j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

j_key = j_builder.build()

## Build keyword tries for first and second halves of both V and J tags
v_half1_builder = AcoraBuilder()
for i in range(0,len(half1_v_seqs)):
    v_half1_builder.add(str(half1_v_seqs[i]))
Example #19
    def __init__(self, text):
        self.text = text
        keywords = ["ownership", "owner", "own", "proprietary", "tracking",
                    "track", "store", "keep", "keeping"]
        builder = AcoraBuilder()
        builder.add(*keywords)
        self.finder = builder.build()
Example #20
def import_tcr_info(inputargs):
    """ import_tcr_info: Gathers the required TCR chain information for Decombining """

    # Get chain information
    global chainnams, chain, counts
    counts = coll.Counter()
    chainnams = {"a": "alpha", "b": "beta", "g": "gamma", "d": "delta"}

    # Detect whether chain specified in filename
    inner_filename_chains = [
        x for x in chainnams.values() if x in inputargs['fastq'].lower()
    ]
    if len(inner_filename_chains) == 1:
        counts['chain_detected'] = 1

    nochain_error = "TCR chain not recognised. \n \
      Please either include (one) chain name in the file name (i.e. alpha/beta/gamma/delta),\n \
      or use the \'-c\' flag with an explicit chain option (a/b/g/d, case-insensitive)."

    if inputargs['chain']:
        if inputargs['chain'].upper() in ['A', 'ALPHA', 'TRA', 'TCRA']:
            chain = "a"
        elif inputargs['chain'].upper() in ['B', 'BETA', 'TRB', 'TCRB']:
            chain = "b"
        elif inputargs['chain'].upper() in ['G', 'GAMMA', 'TRG', 'TCRG']:
            chain = "g"
        elif inputargs['chain'].upper() in ['D', 'DELTA', 'TRD', 'TCRD']:
            chain = "d"
        else:
            print(nochain_error)
            sys.exit()
    else:

        # If no chain provided, try and infer from filename
        if counts['chain_detected'] == 1:
            chain = inner_filename_chains[0][0]

        else:
            print(nochain_error)
            sys.exit()

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print('Importing TCR', chainnams[chain], 'gene sequences...')

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print(
            "Please note that there is currently no extended tag set for mouse TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
    In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."
        )
        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and (chain == 'g' or chain == 'd'):
        print(
            "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
    In future, consider editing the script to change the default, or use the appropriate flags."
        )
        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.
    global v_half_split, j_half_split
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print(
            "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
    Please check tag set and species flag.")
        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print(
            "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
    If mouse is required by default, consider changing the default value in the script."
        )
        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"

    for gene in ['v', 'j']:
        # Get FASTA data
        fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                   gene, "fasta", inputargs['tagfastadir'])
        globals()[gene + "_genes"] = list(SeqIO.parse(fasta_file, "fasta"))

        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append(
                globals()[gene + "_genes"][g].seq.upper())

        # Get tag data
        tag_file = read_tcr_file(inputargs['species'], inputargs['tags'], gene,
                                 "tags",
                                 inputargs['tagfastadir'])  # get tag data
        tag_data = open(tag_file, "r")
        if gene == 'v': jumpfunction = "jump_to_end_v"
        elif gene == 'j': jumpfunction = "jump_to_start_j"
        globals()[gene+"_seqs"], globals()["half1_"+gene+"_seqs"], globals()["half2_"+gene+"_seqs"], globals()[jumpfunction] = \
          globals()["get_"+gene+"_tags"](tag_data, globals()[gene+"_half_split"])
        tag_data.close()

        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str(
                globals()[gene +
                          "_seqs"][i]))  # add all tags to the keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()

        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add(
                str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene +
                  "_key"] = globals()[gene + "_half1_builder"].build()

        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add(
                str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene +
                  "_key"] = globals()[gene + "_half2_builder"].build()
Example #21
import json
import linecache
import os
import re

import jieba
import numpy as np
from acora import AcoraBuilder

from emotion_cla.emo_cls import classify
from emotion_cla.separate import separate

in_dir = 'data/tweet'
out_dir = 'data/tweet_emo'
builder = AcoraBuilder([line.strip() for line in open('data/emoji.txt')])
ac = builder.build()


def load_labelled():
    lines = set()
    for i in range(5):
        for line in open('data/content_3000/{}.txt'.format(i)):
            lines.add(line.strip())
    return lines


# have_lines = load_labelled()


def random_ids(in_name, out_name, lens):
    '''
    for key, values in output_dict.items():  # remove last ", "
        output_dict[key] = values[:-2]

    return output_dict


if __name__ == "__main__":
    args = parsing_argument()

    if not args.source:
        raise Exception("Please input the source file")
    with open(args.source, 'r') as file:
        keywords = file.read().splitlines()  # Reading the source file

    ac = AcoraBuilder(keywords)
    ac = ac.build()  # build the model for searching the keywords

    # Reading the target files
    if args.target_files:
        with open(args.target_files, 'r') as file:
            target_files = file.read().splitlines()
            target_file = [
                target_file for target_file in target_files
                if ".pdf" in target_file or ".html" in target_file
            ]
    else:
        target_files = [
            os.path.join(paths, file)
            for paths, _, files in os.walk(args.target_folder)
            for file in files if '.pdf' in file or '.html' in file
    mouse_proteome_file = [
        x for x in os.listdir(fxn.base_data_dir) if '_mouse.fasta' in x
    ][0]

    mouse_proteins = coll.defaultdict()
    with gzip.open(fxn.base_data_dir + mouse_proteome_file, 'rU') as in_file:
        for protein, seq, blank in fxn.read_fa(in_file):
            mouse_proteins[protein.split(' ')[0]] = seq

    # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file
    data_dir = '../Data/NonPredictedBinders/'
    matches = coll.defaultdict(fxn.nest_counter)
    all_peptides = coll.defaultdict(list)
    for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]:
        nam = f.split('-')[0]
        search_builder = AcoraBuilder()
        peptides = []

        # Build trie
        with open(data_dir + f, 'rU') as in_file:
            for line in in_file:
                search_builder.add(line.rstrip())
                peptides.append(line.rstrip())
                all_peptides[f.split('-')[0]].append(line.rstrip())
        seq_search = search_builder.build()

        # Use to search all proteins in proteome
        for protein in mouse_proteins:
            seq_check = seq_search.findall(mouse_proteins[protein])
            if seq_check:
                for s in seq_check: