Ejemplo n.º 1
0
    def test_complement(self):
        """
        Check the '+' strand output of ``segment._complement``.

        The expected result consists of 'intergenic' rows for the stretches
        of each chromosome listed in the genome file that are not spanned by
        the '+' strand genes given below.
        """
        chromosome_sizes = [
            ['1', '2000'],
            ['2', '1000'],
            ['MT', '500'],
        ]
        genome_file = make_file_from_list(chromosome_sizes, bedtool=False)

        gene_rows = [
            ['1', '.', 'gene1', '200', '400', '.', '+', '.', '.'],
            ['1', '.', 'gene2', '300', '600', '.', '+', '.', '.'],
            ['1', '.', 'gene3', '200', '500', '.', '+', '.', '.'],
            ['2', '.', 'gene4', '100', '200', '.', '+', '.', '.'],
            ['2', '.', 'gene5', '100', '300', '.', '-', '.', '.'],
        ]
        genes = list_to_intervals(gene_rows)

        complement_file = segment._complement(genes, genome_file, '+')
        complement = make_list_from_file(
            complement_file, fields_separator='\t')

        # Attribute column template; only the ID suffix differs between rows.
        empty_col8 = 'ID "inter%s"; gene_id "."; transcript_id ".";'
        # (chromosome, start, end, ID suffix) of every expected gap.
        gaps = [
            ('1', '1', '199', "P00000"),
            ('1', '601', '2000', "P00001"),
            ('2', '1', '99', "P00002"),
            ('2', '201', '1000', "P00003"),
            ('MT', '1', '500', "P00004"),
        ]
        expected = [
            [chrom, '.', 'intergenic', start, end, '.', '+', '.',
             empty_col8 % suffix]
            for chrom, start, end, suffix in gaps
        ]

        self.assertEqual(complement, expected)
Ejemplo n.º 2
0
def make_types_length_file(annotation,
                           fai,
                           out_file=None,
                           subtype='biotype',
                           excluded_types=None):
    """
    Calculate the number of non-overlapping base pairs of each "type".

    In the context of this function "type" equals to the combination of 3rd
    column and attribute subtype from annotation file (GTF).

    Two files are written: ``out_file`` with one tab-separated
    ``type<TAB>length`` line per type (sorted by type), and
    ``out_file + 'modified'`` holding the filtered annotation joined with
    the 'unannotated' regions of both strands.

    Parameters
    ----------
    annotation : str
        Path to annotation GTF file (should include subtype attribute).
    fai : str
        Genome index; it is passed through ``_first_two_columns`` before
        being handed to ``_complement``. Presumably a path to a FASTA
        index (.fai) file - confirm against callers.
    out_file : str
        Path to output file (if None, name is set automatically).
    subtype : str
        Name of the GTF attribute appended to the 3rd column value to form
        the type. If falsy, or if an interval lacks the attribute, the type
        is the 3rd column value alone.
    excluded_types : list_str
        Types listed in 3rd column of GTF to be exclude from analysis.

    Returns
    -------
    tuple of (str, str)
        Absolute path to out_file and absolute path to the modified
        annotation file (``out_file + 'modified'``).

    """
    excluded_types = excluded_types or []
    ann_filtered = pybedtools.BedTool(annotation).filter(
        lambda x: x[2] not in excluded_types).sort().saveas()

    fai = _first_two_columns(fai)

    if out_file is None:
        # The automatic name is derived from the annotation's basename, so on
        # a match the output lands in the current working directory.
        # NOTE(review): the '.' following '\d+' is unescaped and therefore
        # matches any character; a literal dot looks intended - confirm
        # before changing, since it alters the generated file name.
        match = re.match(r'([\w_]+\.\d+.).*', os.path.basename(annotation))
        if match:
            out_file = str(match.group(1)) + 'types_length.txt'
        else:
            out_file = annotation + 'types_length.txt'

    # Merge all annotation and take what is left = unannotated regions:
    unann_pos = _complement(ann_filtered.fn, fai, '+', type_name='unannotated')
    unann_neg = _complement(ann_filtered.fn, fai, '-', type_name='unannotated')
    ann_modified_file = out_file + 'modified'
    ann_joined = ann_filtered.cat(unann_pos, unann_neg, postmerge=False). \
        sort().saveas(ann_modified_file)

    # Group intervals by type.
    data = {}
    for interval in ann_joined:
        sbtyp = interval.attrs.get(subtype, None) if subtype else None
        type_ = ' '.join([interval[2], sbtyp]) if sbtyp else interval[2]
        data.setdefault(type_, []).append(
            create_interval_from_list(interval.fields))

    # Merge the intervals of each type (with the ``s`` option enabled) so
    # overlapping features are counted once, then add up the merged lengths.
    type_lengths = {}
    for type_, intervals in data.items():
        # pylint: disable=unexpected-keyword-arg
        merged = pybedtools.BedTool(i for i in intervals).merge(s=True)
        type_lengths[type_] = sum(len(feature) for feature in merged)

    # Write results to file:
    with open(out_file, 'wt') as outfile:
        outfile.writelines(
            '{}\t{}\n'.format(type_, length)
            for type_, length in sorted(type_lengths.items()))

    return os.path.abspath(out_file), os.path.abspath(ann_modified_file)