예제 #1
0
 def test_updateFeatureGff(self):
     """Compare rows returned by ``Insert.updateFeatureGff`` to fixed tables.

     The Insert object is prepared by calling ``_filterInserts`` and
     ``_updatePositionsInserts`` directly. The feature table is then updated
     once with ``addsuffix=True`` and once with ``addsuffix=False``, and the
     rows selected by their attribute column are checked each time.
     """
     def rows_for(table, pattern):
         # Keep rows whose attribute column (index 8) matches ``pattern``
         # at its start, with every field converted to str.
         return [[str(field) for field in row]
                 for row in table
                 if re.match(pattern, row[8])]

     ies_gff = SharedFunctions.Gff()
     ies_gff.list2gff(TestInsert.gfflist)
     inserter = Insert.Insert(TestInsert.ref, ies_gff, TestInsert.ies)
     inserter._filterInserts()
     inserter._updatePositionsInserts()
     features = [record.split("\t") for record in TestInsert.oldfeatures]

     # Update with segment suffixes appended to the IDs
     with_suffix = inserter.updateFeatureGff(features, addsuffix=True)

     expected_gene1 = [
         ['ctg1', '.', 'gene', '3', '5', '.', '.', '.',
          'ID=gene1.seg_0;key1=attr1;key2=attr2'],
         ['ctg1', '.', 'gene', '10', '11', '.', '.', '.',
          'ID=gene1.seg_1;key1=attr1;key2=attr2'],
     ]
     self.assertEqual(rows_for(with_suffix, 'ID=gene1.'), expected_gene1)

     expected_gene2 = [
         ['ctg1', '.', 'gene', '21', '24', '.', '.', '.', 'ID=gene2'],
     ]
     self.assertEqual(rows_for(with_suffix, 'ID=gene2'), expected_gene2)

     # Multi-segment feature that spans several lines of the feature table
     expected_cds1 = [
         ['ctg1', '.', 'CDS', '3', '5', '.', '+', '0',
          'ID=cds1.seg_0;key1=attr1;key2=attr2'],
         ['ctg1', '.', 'CDS', '10', '11', '.', '+', '0',
          'ID=cds1.seg_1;key1=attr1;key2=attr2'],
         ['ctg1', '.', 'CDS', '21', '24', '.', '+', '0', 'ID=cds1'],
     ]
     self.assertEqual(rows_for(with_suffix, 'ID=cds1'), expected_cds1)

     # Same update, but IDs are left without a segment suffix
     without_suffix = inserter.updateFeatureGff(features, addsuffix=False)
     expected_gene1_plain = [
         ['ctg1', '.', 'gene', '3', '5', '.', '.', '.',
          'ID=gene1;key1=attr1;key2=attr2'],
         ['ctg1', '.', 'gene', '10', '11', '.', '.', '.',
          'ID=gene1;key1=attr1;key2=attr2'],
     ]
     self.assertEqual(
         rows_for(without_suffix, 'ID=gene1;'), expected_gene1_plain)
예제 #2
0
 def test_reportInsertedReference(self):
     """Check the contig sequences returned by ``reportInsertedReference``.

     Builds an Insert object from the TestInsert fixtures and compares the
     sequences of contigs ctg1 and ctg2 in the returned FASTA mapping with
     fixed expected strings.
     """
     ies_gff = SharedFunctions.Gff()
     ies_gff.list2gff(TestInsert.gfflist)
     inserter = Insert.Insert(TestInsert.ref, ies_gff, TestInsert.ies)
     # Only the sequences are inspected here; the returned GFF is ignored
     hybrid_fasta, _ = inserter.reportInsertedReference()

     expected_sequences = (
         ('ctg1', 'AAAAATTTTAAAAGGGGGAAAAAAAAAAA'),
         ('ctg2', 'GGGGGGGGGCCCCCGGGGGGGGGGG'),
     )
     for contig, sequence in expected_sequences:
         self.assertEqual(str(hybrid_fasta[contig].seq), sequence)
예제 #3
0
파일: main.py 프로젝트: Swart-lab/bleties
def insert(args):
    """Insert IESs into, or delete IESs from, a reference genome.

    The mode is chosen by the prefix of ``args.mode``:

    * ``ins...``: IES sequences from ``args.iesfasta`` are inserted into the
      reference and the result is written to ``<out>.iesplus.fasta`` and
      ``<out>.iesplus.gff``. If ``args.featuregff`` is set, that feature
      table is also updated and written to
      ``<out>.iesplus.feature_table.gff``.
    * ``del...``: IESs are removed from the reference and the result is
      written to ``<out>.iesminus.fasta`` and ``<out>.iesminus.gff``.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``ref``, ``ies``, ``mode`` and ``out``, plus ``iesfasta``,
            ``featuregff`` and ``addsuffix`` in insert mode.

    Raises:
        ValueError: If ``args.mode`` starts with neither "ins" nor "del".
    """
    logger = logging.getLogger("main.insert")
    run_boilerplate(logger)
    logger.info("Started BleTIES Insert")

    # Resolve the mode before any file is parsed. Previously an unrecognised
    # mode fell through both branches and only failed further down with an
    # UnboundLocalError on the output file names.
    insert_mode = re.match(r"ins", args.mode)
    delete_mode = re.match(r"del", args.mode)
    if not (insert_mode or delete_mode):
        raise ValueError(
            f"Unrecognized mode {args.mode!r}: "
            "expected a value starting with 'ins' or 'del'")

    logger.info("Reading input files")
    refgenome = SeqIO.to_dict(SeqIO.parse(args.ref, "fasta"))
    gff = SharedFunctions.Gff()
    gff.file2gff(args.ies)

    if insert_mode:
        ies = SeqIO.to_dict(SeqIO.parse(args.iesfasta, "fasta"))
        logger.info(
            "Inserting IESs to MAC reference to make MAC+IES hybrid reference")
        ins = Insert.Insert(refgenome, gff, ies)
        newrefgenome, newgff = ins.reportInsertedReference()
        if args.featuregff:
            logger.info(f"Updating coords in feature table {args.featuregff}")
            # Slurp file into memory as list of lists, skipping comment lines
            # and any line that does not have exactly nine tab-separated
            # columns. Each line is split only once.
            feats = []
            with open(args.featuregff, 'r') as fh:
                for line in fh:
                    if line.startswith('#'):
                        continue
                    fields = line.rstrip().split("\t")
                    if len(fields) == 9:
                        feats.append(fields)
            # Update coordinates of feature table to account for inserted IESs
            newfeaturegff = ins.updateFeatureGff(feats, args.addsuffix)
            outfeaturegff = f"{args.out}.iesplus.feature_table.gff"
            logger.info(f"Writing new feature table to {outfeaturegff}")
            with open(outfeaturegff, 'w') as fh:
                fh.write("##gff-version 3\n")
                for record in newfeaturegff:
                    fh.write("\t".join([str(i) for i in record]))
                    fh.write("\n")
        outfasta = f"{args.out}.iesplus.fasta"
        outgff = f"{args.out}.iesplus.gff"
    else:
        # Delete mode (guaranteed by the validation above)
        logger.info(
            "Removing IESs from MAC+IES reference to make MAC-IES reference")
        dels = Insert.Insert(refgenome, gff, None)
        newrefgenome, newgff = dels.reportDeletedReference()
        outfasta = f"{args.out}.iesminus.fasta"
        outgff = f"{args.out}.iesminus.gff"

    logger.info(f"Writing output files to {outfasta}, {outgff}")
    with open(outfasta, "w") as fh:
        SeqIO.write(list(newrefgenome.values()), fh, "fasta")
    newgff.gff2file(outgff, header=True)
    logger.info("Finished Insert")
예제 #4
0
 def test_updateFeatureGff_tapointer(self):
     """Check coordinates and TA pointer attributes of ies6 and ies7.

     The private Insert steps (``_filterInserts``, ``_updatePositionsInserts``,
     ``_updatePointerPositionsInserts``, ``_addSequences``) are run in order,
     after which the ``start``/``end`` values and the ``ta_pointer_start``/
     ``ta_pointer_end`` attributes stored in ``_newgff`` are compared with
     fixed expected values.
     """
     ies_gff = SharedFunctions.Gff()
     ies_gff.list2gff(TestInsert.gfflist)
     inserter = Insert.Insert(TestInsert.ref, ies_gff, TestInsert.ies)
     for step in (inserter._filterInserts,
                  inserter._updatePositionsInserts,
                  inserter._updatePointerPositionsInserts,
                  inserter._addSequences):
         step()

     # (IES id, (start, end), (ta_pointer_start, ta_pointer_end))
     expectations = (
         ('ies6', ('15', '19'), ('16', '20')),
         ('ies7', ('11', '15'), ('11', '15')),
     )
     for ies_id, (start, end), (ptr_start, ptr_end) in expectations:
         self.assertEqual(
             str(inserter._newgff.getValue(ies_id, 'start')), start)
         self.assertEqual(
             str(inserter._newgff.getValue(ies_id, 'end')), end)
         self.assertEqual(
             str(inserter._newgff.getAttr(ies_id, 'ta_pointer_start')),
             ptr_start)
         self.assertEqual(
             str(inserter._newgff.getAttr(ies_id, 'ta_pointer_end')),
             ptr_end)
예제 #5
0
 def test_reportDeletedReference(self):
     """Round-trip test: insert IESs, delete them again, compare to the input.

     After ``reportInsertedReference`` followed by ``reportDeletedReference``
     the sequences of ctg1, ctg2 and ctg3 must equal those in
     ``TestInsert.ref``, and both TA pointer attributes of ies6 in the
     resulting GFF must be '10'.
     """
     source_gff = SharedFunctions.Gff()
     source_gff.list2gff(TestInsert.gfflist)

     # Forward direction: put the IES sequences into the reference
     inserter = Insert.Insert(TestInsert.ref, source_gff, TestInsert.ies)
     plus_fasta, plus_gff = inserter.reportInsertedReference()

     # Reverse direction: remove them from the hybrid reference again
     deleter = Insert.Insert(plus_fasta, plus_gff, None)
     minus_fasta, minus_gff = deleter.reportDeletedReference()

     # Every contig must be back to its original sequence
     for contig in ('ctg1', 'ctg2', 'ctg3'):
         self.assertEqual(
             str(minus_fasta[contig].seq),
             str(TestInsert.ref[contig].seq))

     for pointer_attr in ('ta_pointer_start', 'ta_pointer_end'):
         self.assertEqual(str(minus_gff.getAttr('ies6', pointer_attr)), '10')
예제 #6
0
    type=int,
    help="Minimum length to show in histogram of IES lengths (detail)")
# Upper bound (integer, default 400) for the detailed IES length histogram
parser.add_argument(
    "--hist_len_max",
    default=400,
    type=int,
    help="Maximum length to show in histogram of IES lengths (detail)")
# Histogram layout selector; the accepted values are listed in the help text
parser.add_argument(
    "--hist_style",
    default="facet",
    help="Style for histograms of IES lengths: 'facet', 'bar', or 'barstacked'"
)
args = parser.parse_args()

# Import GFF records from the file given by args.gff
gff_obj = SharedFunctions.Gff()
gff_obj.file2gff(args.gff)

# Get relevant fields: one dict per GFF record, keyed by field name
out = []
for gff_id in gff_obj:
    row = {}
    row['id'] = gff_id
    # Classify the record by comparing its start and end coordinates:
    # start == end is labelled "ins", start < end is labelled "del"
    # (presumably insertion / deletion relative to the reference -- confirm).
    if int(gff_obj.getValue(gff_id,
                            "start")) == int(gff_obj.getValue(gff_id, "end")):
        row['type'] = "ins"
    elif int(gff_obj.getValue(gff_id, "start")) < int(
            gff_obj.getValue(gff_id, "end")):
        row['type'] = "del"
    else:
        # start > end is treated as a malformed record and aborts the script
        raise Exception(f"Invalid GFF3, start > end: check {row['id']}")
예제 #7
0
파일: main.py 프로젝트: Swart-lab/bleties
def miser(args):
    """Report putative IESs that may be spurious, with a diagnosis for each.

    For every entry of the IES GFF file, the mismatch percentages of reads
    with and without the indel are obtained from
    ``Milraa.IesRecords.reportIndelReadMismatchPc``. One tab-separated row
    per entry is written to ``args.out``, ending in a diagnosis: "ok",
    "high_error", "misassembly", "paralog" or "low_coverage". When
    ``args.split_gff`` is set, the input GFF entries are additionally written
    to one ``<args.gff>.<diagnosis>.gff3`` file per diagnosis.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``bam``, ``ref``, ``gff``, ``out`` (an object with ``write``),
            ``min_ies_length``, ``spurious_ies_test``,
            ``spurious_ies_pvalue`` and ``split_gff``.
    """
    logger = logging.getLogger("main.miser")
    run_boilerplate(logger)
    logger.info("Started BleTIES MISER")

    def mean_or_na(values):
        # Taking the mean of an empty list raises (StatisticsError, assuming
        # `stats` is the stdlib statistics module), so report "NA" instead.
        if len(values) > 0:
            return round(stats.mean(values), 2)
        return "NA"

    # Read input files
    alnfile, refgenome = read_bam_ref(args.bam, args.ref)
    # Everything below runs inside try/finally so that the alignment
    # filehandle is closed even if an error is raised part-way through.
    try:
        # Initialize new IesRecords object to store putative IESs
        iesrecords = Milraa.IesRecords(alnfile, "bam", refgenome)

        # Read in IES GFF file produced by Milraa
        logger.info("Reading GFF file containing putative IESs")
        iesgff = SharedFunctions.Gff()
        iesgff.file2gff(args.gff)

        # Compare mean mismatch percentage of reads with and without putative
        # IES for each putative IES
        logger.info(
            "\n    Reporting possibly spurious IESs due to misassembly or "
            "mapped paralogs\n    ")
        # dict to hold split GFF file keyed by diagnosis
        out_gff_split = defaultdict(list)
        args.out.write("\t".join([
            'ID', 'mean_mismatch_pc_with_indel', 'mean_mismatch_pc_no_indel',
            'stdev_with_indel', 'stdev_no_indel', 'statistic', 'p-value',
            'num_reads_with_indel', 'num_reads_no_indel', 'diagnosis'
        ]))
        args.out.write("\n")
        for bpid in iesgff:
            ins_mm, non_mm = iesrecords.reportIndelReadMismatchPc(
                iesgff.getValue(bpid, 'seqid'),
                int(iesgff.getValue(bpid, 'start')),
                int(iesgff.getValue(bpid, 'end')),
                int(iesgff.getAttr(bpid, 'IES_length')),
                int(args.min_ies_length))
            # Perform test of mismatch % only if both groups have more than
            # 2 reads (otherwise stdev meaningless)
            if len(ins_mm) > 2 and len(non_mm) > 2:
                if args.spurious_ies_test == 'mann-whitney':
                    # Mann-Whitney U test for whether mismatch % with indel
                    # of interest is greater than without
                    mwstat, mwpval = mannwhitneyu(ins_mm,
                                                  non_mm,
                                                  alternative='greater')
                else:
                    # Welch's t-test (non-equal population variances)
                    mwstat, mwpval = ttest_ind(ins_mm, non_mm,
                                               equal_var=False)
                # Means are used for both reporting and diagnosis
                mean_ins = stats.mean(ins_mm)
                mean_non = stats.mean(non_mm)
                # Report
                outarr = [
                    bpid,
                    round(mean_ins, 2),
                    round(mean_non, 2),
                    round(stats.stdev(ins_mm), 2),
                    round(stats.stdev(non_mm), 2),
                    round(mwstat, 2),
                    '%.2E' % mwpval,  # scientific notation
                    len(ins_mm),
                    len(non_mm)
                ]
                # Diagnosis: later checks deliberately override earlier ones
                diagnosis = "ok"
                if mean_ins > 5 or mean_non > 5:
                    diagnosis = "high_error"
                if len(ins_mm) > len(non_mm):
                    diagnosis = 'misassembly'
                # PVAL_UNCORR = 0.05 # TODO magic number
                pval_corr = args.spurious_ies_pvalue / \
                    len(iesgff)  # Bonferroni correction
                if mwpval < pval_corr and mean_ins > mean_non:
                    diagnosis = "paralog"
            elif len(non_mm) < 1:
                # No reads without the indel at all
                outarr = [
                    bpid,
                    mean_or_na(ins_mm), "NA", "NA", "NA", "NA", "NA",
                    len(ins_mm),
                    len(non_mm)
                ]
                diagnosis = "misassembly"  # or scrambling
            else:
                # Too few reads in at least one group for a meaningful test;
                # ins_mm may be empty here, hence mean_or_na
                outarr = [
                    bpid,
                    mean_or_na(ins_mm),
                    mean_or_na(non_mm), "NA", "NA", "NA", "NA",
                    len(ins_mm),
                    len(non_mm)
                ]
                diagnosis = "low_coverage"
            outarr.append(diagnosis)
            # Split the input GFF entries into each diagnosis group
            out_gff_split[diagnosis].append(iesgff.getEntry(bpid))
            # Write output
            args.out.write("\t".join([str(i) for i in outarr]))
            args.out.write("\n")

        # Output split GFF files
        if args.split_gff:
            logger.info("Splitting input GFF entries into inferred categories")
            for diag in out_gff_split:
                outfile = f"{args.gff}.{diag}.gff3"
                with open(outfile, "w") as fh_spl:
                    # Write gff version header and comment some info on this
                    # file
                    fh_spl.write("##gff-version 3\n")
                    fh_spl.write("# " + " ".join(sys.argv) + "\n")
                    fh_spl.write(
                        f"# BleTIES MISER putative IESs classified as {diag}\n")
                    for line in out_gff_split[diag]:
                        fh_spl.write("\t".join([str(i) for i in line]))
                        fh_spl.write("\n")
    finally:
        # Close alignment filehandle
        alnfile.close()
    logger.info("Finished MISER")