Beispiel #1
0
 def load_names(self):
     """Read ``names.dmp`` and record scientific names in ``self.names``.

     The file is looked up inside ``self.directory``. Each line is passed
     through ``self.parse_taxdump_row``; rows that come back falsy are
     ignored, as are rows whose fourth field is not ``'scientific name'``.
     For the remaining rows the first field (as ``int``) is mapped to the
     second field.
     """
     names_path = os.path.abspath(os.path.join(self.directory, 'names.dmp'))
     for raw_line in file_io.stream_file(names_path):
         fields = self.parse_taxdump_row(raw_line)
         if not fields:
             continue
         if fields[3] != 'scientific name':
             continue
         taxid = int(fields[0])
         self.names[taxid] = fields[1]
Beispiel #2
0
def parse_blast(blast_file, results=None, index=0):
    """Collect hits from a tab-separated file, grouped by sequence id.

    Column layout read from each row:

    * column 0: query id, optionally ``<seq_id>_-_<offset>``; the integer
      offset (0 when absent) is added to both positions.
    * column 1: taxid; stored as 0 when it is not an integer.
    * column 2: score, stored as ``float``.
    * columns 4, 9 and 10: subject, start and end. When any of these
      indexes is out of range the subject is taken from column 3 instead
      and start/end are ``None``.

    Args:
        blast_file: Path handed to ``file_io.stream_file``.
        results: Existing mapping to append to; a new
            ``defaultdict(list)`` is created when ``None``.
        index: Value stored under ``'file'`` in every hit.

    Returns:
        The mapping of sequence id to list of hit dicts.
    """
    hits_by_seq = defaultdict(list) if results is None else results
    for raw_line in file_io.stream_file(blast_file):
        fields = raw_line.rstrip().split('\t')
        query_parts = fields[0].split('_-_')
        seq_id = query_parts[0]
        shift = int(query_parts[1]) if len(query_parts) > 1 else 0
        try:
            subject = fields[4]
            score = float(fields[2])
            start = int(fields[9]) + shift
            end = int(fields[10]) + shift
        except IndexError:
            # Short row: no positions available, subject sits in column 3.
            subject = fields[3]
            score = float(fields[2])
            start = end = None
        hit = {
            'subject': subject,
            'score': score,
            'start': start,
            'end': end,
            'file': index,
        }
        try:
            hit['taxid'] = int(fields[1])
        except ValueError:
            hit['taxid'] = 0
        hits_by_seq[seq_id].append(hit)
    return hits_by_seq
Beispiel #3
0
 def load_ranks(self):
     """Load taxon ranks from ``nodes.dmp`` into ``self.ranks``.

     The file is looked up inside ``self.directory``. Each line is passed
     through ``self.parse_taxdump_row``; for every row with at least three
     fields, ``int(row[0])`` is mapped to ``row[2]``. Shorter rows are
     skipped.

     Raises:
         SystemExit: With status 1, after printing an error message, when
             a ``TypeError`` is raised while reading or parsing the file
             (presumably when ``parse_taxdump_row`` returns a value without
             a length -- confirm against that helper).
     """
     filename = os.path.abspath(os.path.join(self.directory, 'nodes.dmp'))
     try:
         for line in file_io.stream_file(filename):
             row = self.parse_taxdump_row(line)
             # Index 2 is read below, so the row needs at least 3 fields.
             if len(row) > 2:
                 self.ranks[int(row[0])] = row[2]
     except TypeError:
         print("ERROR: Unable to parse %s." % filename)
         # ``exit`` is injected by the ``site`` module and is not guaranteed
         # to exist (e.g. ``python -S``); SystemExit is always available.
         raise SystemExit(1)
Beispiel #4
0
def parse_synonyms(synonym_file, delimiter, columns, header, identifiers):
    """Parse a delimited synonyms file into an Array field.

    Args:
        synonym_file: Path to the synonyms file, optionally followed by
            ``=<prefix>``. Without a prefix the file stem is used.
        delimiter: Passed to ``set_delimiter`` together with the first line
            of the file; the result is used as the ``re.split`` pattern for
            every row.
        columns: Comma-separated column names, or a falsy value for none.
        header: When truthy, the first line is treated as a header row and
            handed to ``parse_header_row`` to obtain the column names.
        identifiers: Object providing ``to_set()`` and ``values``.

    Returns:
        Array: Field named ``<prefix>_synonyms`` holding, for each entry of
        ``identifiers.values``, the list of non-identifier values from the
        matching row (an empty list when no row matched). ``headers`` is the
        column list with the identifier column removed when it is present.

    The identifier column is the one named ``identifier`` in ``columns``;
    failing that, it is the first column in which a known identifier is
    seen. Double quotes are stripped from every line before splitting. An
    empty file yields an empty list for every identifier.
    """
    synonym_file, *prefix = synonym_file.split("=")
    prefix = prefix[0] if prefix else Path(synonym_file).stem
    meta = {"field_id": "%s_synonyms" % prefix}
    ids = identifiers.to_set()
    lines = list(file_io.stream_file(synonym_file))
    columns = columns.split(",") if columns else []
    delimit = None
    if lines:
        # Delimiter detection needs a sample line, so it is skipped for an
        # empty file (the row loop below then has nothing to split).
        delimit = set_delimiter(delimiter, sample=lines[0])
        if header:
            header_row = lines[0].rstrip().replace('"', "")
            columns = parse_header_row(delimit, header_row, columns)
            lines = lines[1:]
    try:
        id_col = columns.index("identifier")
    except ValueError:
        id_col = None
    by_id = {}
    for line in lines:
        row = re.split(delimit, line.rstrip().replace('"', ""))
        key = None
        names = []
        for i, value in enumerate(row):
            if id_col is None and value in ids:
                # First known identifier seen: remember its column so later
                # rows are keyed by position rather than by membership.
                key = value
                id_col = i
            elif i == id_col:
                key = value
            else:
                names.append(value)
        by_id[key] = names
    values = [by_id.get(identifier, []) for identifier in identifiers.values]
    # The identifier column may never have been found, or may have been
    # detected from the data at an index that ``columns`` does not cover.
    if id_col is not None and id_col < len(columns):
        del columns[id_col]
    synonyms_field = Array(
        meta["field_id"],
        meta=meta,
        values=values,
        headers=columns,
        parents=["children"],
    )
    return synonyms_field
Beispiel #5
0
def parse_blast(blast_file, cols, results=None, index=0, evalue=1, bitscore=1):
    """Parse a tab-separated hits file into a dict of hit lists.

    Args:
        blast_file: Path handed to ``file_io.stream_file``.
        cols: Mapping of column name to column index. The keys read are
            ``bitscore``, ``evalue``, ``qseqid``, ``sseqid``, ``sstart``,
            ``send`` and ``staxids``. The mapping is not modified.
        results: Existing mapping to append to; a new
            ``defaultdict(list)`` is created when ``None``.
        index: Value stored under ``'file'`` in every hit.
        evalue: Rows whose evalue is greater than this are skipped (not
            applied to 4-column rows).
        bitscore: Rows whose score is lower than this are skipped.

    Returns:
        The mapping of sequence id to list of hit dicts with keys
        ``subject``, ``score``, ``start``, ``end``, ``file`` and ``taxid``.

    A query id of the form ``<seq_id>_-_<offset>`` is split, and the integer
    offset is added to both positions. When a position column is out of
    range for the row, ``start`` and ``end`` are ``None``. Only the first of
    any ``;``-separated taxids is used; a non-integer taxid is stored as 0.
    """
    if results is None:
        results = defaultdict(list)
    # Work on a private copy: the 4-column override below must not leak into
    # the caller's mapping, which may be reused for other files.
    cols = dict(cols)
    for line in file_io.stream_file(blast_file):
        row = line.rstrip().split('\t')
        score = float(row[cols['bitscore']])
        if score < bitscore:
            continue
        if len(row) == 4:
            # 4-column rows keep the subject in the last column; the
            # override stays in effect for the rest of this file.
            cols['sseqid'] = 3
        elif evalue < float(row[cols['evalue']]):
            continue
        seq_id, *offset = row[cols['qseqid']].split('_-_')
        offset = int(offset[0]) if offset else 0
        subject = row[cols['sseqid']]
        try:
            start = int(row[cols['sstart']]) + offset
            end = int(row[cols['send']]) + offset
        except IndexError:
            # Row has no position columns.
            start = end = None
        hit = {
            'subject': subject,
            'score': score,
            'start': start,
            'end': end,
            'file': index,
        }
        # ``split`` always returns at least one item, so this cannot fail.
        taxid = row[cols['staxids']].split(';')[0]
        try:
            hit['taxid'] = int(taxid)
        except ValueError:
            hit['taxid'] = 0
        results[seq_id].append(hit)
    return results
Beispiel #6
0
 def load_ancestors(self):
     """Build ``self.ancestors`` from ``taxidlineage.dmp``.

     The file is looked up inside ``self.directory`` and each line is
     passed through ``self.parse_taxdump_row``. Rows with an empty second
     field are skipped. For every other row, a rank-to-taxid dict is
     stored under ``int(row[0])``:

     * each space-separated id in the second field is added under its rank
       from ``self.ranks`` when that rank is in ``self.list_ranks()``;
     * the taxon itself is added under its own rank on the same condition;
     * every rank in ``self.list_ranks()`` that is still missing is set to
       the negated taxid of the most recent rank that was present in that
       iteration order, or 0 when none has been seen yet.
     """
     lineage_path = os.path.abspath(
         os.path.join(self.directory, 'taxidlineage.dmp'))
     for raw_line in file_io.stream_file(lineage_path):
         fields = self.parse_taxdump_row(raw_line)
         if not fields[1]:
             continue
         taxid = int(fields[0])
         lineage = {}
         for token in fields[1].split(' '):
             ancestor_rank = self.ranks[int(token)]
             if ancestor_rank in self.list_ranks():
                 lineage[ancestor_rank] = int(token)
         self.ancestors[taxid] = lineage
         own_rank = self.ranks[taxid]
         if own_rank in self.list_ranks():
             lineage[own_rank] = taxid
         # Negative values mark ranks inherited from the nearest rank seen.
         placeholder = 0
         for rank in self.list_ranks():
             if rank in lineage:
                 placeholder = -lineage[rank]
             else:
                 lineage[rank] = placeholder
Beispiel #7
0
def parse_synonyms(synonym_file, identifiers):
    """Build a synonyms Array from a tab-separated file.

    Double quotes are stripped from each line before it is split on tabs.
    Within a row, the last value found in ``identifiers.to_set()`` becomes
    the row's key (``None`` when there is none), and every other value is
    kept as a synonym. Later rows with the same key replace earlier ones.

    Args:
        synonym_file: Path to the file; its stem names the field
            (``<stem>_synonyms``).
        identifiers: Object providing ``to_set()`` and ``values``.

    Returns:
        Array: One list of synonyms per entry of ``identifiers.values``,
        empty where no row matched.
    """
    field_id = "%s_synonyms" % Path(synonym_file).stem
    meta = {'field_id': field_id}
    known_ids = identifiers.to_set()
    synonyms = {}
    for raw_line in file_io.stream_file(synonym_file):
        cells = raw_line.rstrip().replace('"', '').split('\t')
        matched = [cell for cell in cells if cell in known_ids]
        others = [cell for cell in cells if cell not in known_ids]
        synonyms[matched[-1] if matched else None] = others
    values = [
        synonyms.get(identifier, []) for identifier in identifiers.values
    ]
    return Array(
        field_id,
        meta=meta,
        values=values,
        parents=['children'],
    )
Beispiel #8
0
def parse_blast(blast_file, cols, results=None, index=0, evalue=1, bitscore=1):
    """Parse a tab-separated hits file into a dict of hit lists.

    Two row flavours are recognised by the query id:

    * blastp: the query contains both ``:`` and ``=`` and has the form
      ``<seq_id>:<start>-<end>=<title>[=<flag>]``. Positions come from the
      query id, the title is stored under ``'title'``, rows whose third
      ``=``-separated part is ``fragmented`` are skipped, and only the
      highest-scoring hit per query is kept.
    * blastx/blastn: positions come from the ``qstart``/``qend`` columns
      (``None`` when those indexes are out of range for the row), shifted
      by the integer offset of a ``<seq_id>_-_<offset>`` query id.

    Args:
        blast_file: Path handed to ``file_io.stream_file``.
        cols: Mapping of column name to column index. The keys read are
            ``bitscore``, ``evalue``, ``qseqid``, ``sseqid``, ``qstart``,
            ``qend`` and ``staxids``; ``sstart``/``send`` are accepted as
            aliases for ``qstart``/``qend``. The mapping is not modified.
        results: Existing mapping to append to; a new
            ``defaultdict(list)`` is created when ``None``.
        index: Value stored under ``'file'`` in every hit.
        evalue: Rows whose evalue is greater than this are skipped (not
            applied to 4-column rows).
        bitscore: Rows whose score is lower than this are skipped.

    Returns:
        The mapping of sequence id to list of hit dicts. Only the first of
        any ``;``-separated taxids is used; a non-integer taxid becomes 0.
    """
    if results is None:
        results = defaultdict(list)
    # Work on a private copy: the overrides below must not leak into the
    # caller's mapping, which may be reused for other files.
    cols = dict(cols)
    # allow for mis-specified columns following documentation bug
    if "sstart" in cols and "qstart" not in cols:
        cols["qstart"] = cols["sstart"]
    if "send" in cols and "qend" not in cols:
        cols["qend"] = cols["send"]
    bitscores = {}  # best score seen so far per blastp query
    blastp = {}  # query -> (seq_id, best hit), in first-seen order

    for line in file_io.stream_file(blast_file):
        row = line.rstrip().split("\t")
        score = float(row[cols["bitscore"]])
        if score < bitscore:
            continue
        if len(row) == 4:
            # 4-column rows keep the subject in the last column; the
            # override stays in effect for the rest of this file.
            cols["sseqid"] = 3
        elif evalue < float(row[cols["evalue"]]):
            continue
        query = row[cols["qseqid"]]
        seq_id, *offset = query.split("_-_")
        offset = int(offset[0]) if offset else 0
        if ":" in query and "=" in query:
            # parse blastp
            parts = query.split("=")
            if query in bitscores and score <= bitscores[query]:
                continue
            if len(parts) == 3 and parts[2] == "fragmented":
                continue
            bitscores[query] = score
            # Split on the last ":" and the first "-" after it so that
            # sequence ids containing "-" or ":" are still accepted.
            seq_id, _, span = parts[0].rpartition(":")
            start, _, end = span.partition("-")
            hit = {
                "subject": row[cols["sseqid"]],
                "score": score,
                "start": int(start),
                "end": int(end),
                "file": index,
                "title": parts[1],
            }
        else:
            # parse blastx/blastn
            subject = row[cols["sseqid"]]
            try:
                start = int(row[cols["qstart"]]) + offset
                end = int(row[cols["qend"]]) + offset
            except IndexError:
                # parse file without positions
                start = end = None
            hit = {
                "subject": subject,
                "score": score,
                "start": start,
                "end": end,
                "file": index,
            }
        # ``split`` always returns at least one item, so this cannot fail.
        taxid = row[cols["staxids"]].split(";")[0]
        try:
            hit["taxid"] = int(taxid)
        except ValueError:
            # taxid column does not hold an integer
            hit["taxid"] = 0
        if bitscores:
            # Once any blastp row has been seen, hits are held back so that
            # a better-scoring row for the same query can replace them.
            # The sequence id is stored with the hit so the query string
            # does not have to be parsed a second time.
            blastp[query] = (seq_id, hit)
        else:
            results[seq_id].append(hit)
    for seq_id, hit in blastp.values():
        results[seq_id].append(hit)
    return results