Example #1
 def get_edge_list():
     cn = barbell_graph(GC.barbell_m1, GC.barbell_m2)
     out = GC.nx2favites(cn, 'u')
     f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb',
               9)
     f.write('\n'.join(out).encode())
     f.write(b'\n')
     f.close()
     GC.cn_communities = [
         {i
          for i in range(GC.barbell_m1)},
         {
             i
             for i in range(GC.barbell_m1 +
                            GC.barbell_m2, 2 * GC.barbell_m1 +
                            GC.barbell_m2)
         }
     ]  # only left and right communities, not the path
     f = gopen(
         expanduser("%s/contact_network_partitions.txt.gz" % GC.out_dir),
         'wb', 9)
     f.write(str(GC.cn_communities).encode())
     f.write(b'\n')
     f.close()
     GC.cn_communities = [{str(i) for i in c} for c in GC.cn_communities]
     return out
Example #2
 def flush_buffers(self):
     """Append to R1 and R2 fastq.gz files"""
     with gopen(self.R1_fastq_name, "ab") as fastq_file:
         fastq_file.write(self.R1_buffer)
         self.R1_buffer = ""
     with gopen(self.R2_fastq_name, "ab") as fastq_file:
         fastq_file.write(self.R2_buffer)
         self.R2_buffer = ""
 def introduce_sequencing_error(node):
     if not hasattr(GC, "sequencing_file"):
         if GC.art_454_amplicon_mode == "single":
             GC.sequencing_file = gopen(
                 '%s/error_prone_files/sequence_data_subsampled_errorprone.fastq.gz'
                 % GC.out_dir, 'wb', 9)
         else:
             GC.sequencing_file = gopen(
                 '%s/error_prone_files/sequence_data_subsampled_errorprone_read1.fastq.gz'
                 % GC.out_dir, 'wb', 9)
             GC.sequencing_file2 = gopen(
                 '%s/error_prone_files/sequence_data_subsampled_errorprone_read2.fastq.gz'
                 % GC.out_dir, 'wb', 9)
     orig_dir = getcwd()
     chdir(GC.out_dir)
     makedirs("ART_output", exist_ok=True)
     chdir("ART_output")
     cn_label = node.get_name()
     for t in GC.final_sequences[cn_label]:
         f = NamedTemporaryFile(mode='w')
         for l, s in GC.final_sequences[cn_label][t]:
             f.write(">%s\n%s\n" % (l, s))
         f.flush()
         command = [GC.art_454_path] + GC.art_454_options
         if GC.random_number_seed is not None:
             command += ['-r', str(GC.random_number_seed)]
             GC.random_number_seed += 1
         if GC.art_454_amplicon_mode == "single":
             command.append('-A')
         else:
             command.append('-B')
         command.append(f.name)
         command.append('%s_%f' % (cn_label, t))
         command.append(str(GC.art_454_reads_pairs_per_amplicon))
         try:
             call(command,
                  stdout=open('%s_%f.log' % (cn_label, t), 'w'),
                  stderr=STDOUT)
         except FileNotFoundError:
             chdir(GC.START_DIR)
             assert False, "art_454 executable was not found: %s" % GC.art_454_path
         f.close()
         if GC.art_454_amplicon_mode == "single":
             for l in open('%s_%f.fq' % (cn_label, t)):
                 GC.sequencing_file.write(l.encode())
         else:
             rename('%s_%f.fq' % (cn_label, t),
                    '%s_%f_read1.fq' % (cn_label, t))
             for l in open('%s_%f_read1.fq' % (cn_label, t)):
                 GC.sequencing_file.write(l.encode())
             rename('%s_%f2.fq' % (cn_label, t),
                    '%s_%f_read2.fq' % (cn_label, t))
             for l in open('%s_%f_read2.fq' % (cn_label, t)):
                 GC.sequencing_file2.write(l.encode())
     chdir(orig_dir)
Example #4
 def run(self):
     for key in ['reads_1', 'reads_2']:
         reads_in = self.reads.output()[key].local_path()
         target = self.output()[key]
         reads_out = target.local_path()
         with gopen(reads_in) as ifile, gopen(reads_out, 'w') as ofile:
             for i, line in enumerate(ifile):
                 if (i % 4) == 1:
                     line = line[:CLIP_LEN] + b'\n'
                 ofile.write(line)
         target.set_payload(reads_out)
         target.upload()
Example #5
 def get_edge_list():
     cn = relaxed_caveman_graph(GC.cave_num_cliques, GC.cave_clique_size, GC.cave_prob, seed=GC.random_number_seed)
     if GC.random_number_seed is not None:
         GC.random_number_seed += 1
     out = GC.nx2favites(cn, 'u')
     f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir),'wb',9)
     f.write('\n'.join(out).encode()); f.write(b'\n')
     f.close()
     GC.cn_communities = [{c*GC.cave_clique_size+i for i in range(GC.cave_clique_size)} for c in range(GC.cave_num_cliques)]
     f = gopen(expanduser("%s/contact_network_partitions.txt.gz" % GC.out_dir),'wb',9)
     f.write(str(GC.cn_communities).encode()); f.write(b'\n')
     f.close()
     GC.cn_communities = [{str(i) for i in c} for c in GC.cn_communities]
     return out
 def get_edge_list():
     du = GC.d_or_u == 'd'
     cn = random_partition_graph(GC.rpg_sizes, GC.rpg_p_in, GC.rpg_p_out, directed=du, seed=GC.random_number_seed)
     if GC.random_number_seed is not None:
         GC.random_number_seed += 1
     out = GC.nx2favites(cn, GC.d_or_u)
     f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir),'wb',9)
     f.write('\n'.join(out).encode()); f.write(b'\n')
     f.close()
     f = gopen(expanduser("%s/contact_network_partitions.txt.gz" % GC.out_dir),'wb',9)
     f.write(str(cn.graph['partition']).encode()); f.write(b'\n')
     f.close()
     GC.cn_communities = [{str(n) for n in c} for c in cn.graph['partition']]
     return out
 def introduce_sequencing_error(node):
     if not hasattr(GC,"sequencing_file"):
         GC.sequencing_file = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone_read1.fastq.gz'%GC.out_dir, 'wb', 9)
         GC.sequencing_file2 = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone_read2.fastq.gz'%GC.out_dir, 'wb', 9)
     orig_dir = getcwd()
     chdir(GC.out_dir)
     makedirs("DWGSIM_output", exist_ok=True)
     chdir("DWGSIM_output")
     cn_label = node.get_name()
     for t in GC.final_sequences[cn_label]:
         f = NamedTemporaryFile(mode='w')
         for l,s in GC.final_sequences[cn_label][t]:
             f.write(">%s\n%s\n" % (l,s))
         f.flush()
         command = [GC.dwgsim_path] + GC.dwgsim_options
         if GC.random_number_seed is not None:
             command += ['-z',str(GC.random_number_seed)]
             GC.random_number_seed += 1
         command.append(f.name)
         command.append('%s_%f' % (cn_label,t))
         try:
             call(command, stderr=open('%s_%f.log' % (cn_label,t), 'w'))
         except FileNotFoundError:
             chdir(GC.START_DIR)
             assert False, "dwgsim executable was not found: %s" % GC.dwgsim_path
         f.close()
         if isfile('%s_%f.bwa.read1.fastq' % (cn_label,t)):
             f = open('%s_%f.bwa.read1.fastq' % (cn_label,t))
         elif isfile('%s_%f.bwa.read1.fastq.gz' % (cn_label,t)):
             f = gopen('%s_%f.bwa.read1.fastq.gz' % (cn_label,t))
         else:
             raise FileNotFoundError("Couldn't find %s_%f.bwa.read1.fastq or %s_%f.bwa.read1.fastq.gz" % (cn_label,t,cn_label,t))
         for l in f:
             if isinstance(l,bytes):
                 GC.sequencing_file.write(l)
             else:
                 GC.sequencing_file.write(l.encode())
         if isfile('%s_%f.bwa.read2.fastq' % (cn_label,t)):
             f = open('%s_%f.bwa.read2.fastq' % (cn_label,t))
         elif isfile('%s_%f.bwa.read2.fastq.gz' % (cn_label,t)):
             f = gopen('%s_%f.bwa.read2.fastq.gz' % (cn_label,t))
         else:
             raise FileNotFoundError("Couldn't find %s_%f.bwa.read2.fastq or %s_%f.bwa.read2.fastq.gz" % (cn_label,t,cn_label,t))
         for l in f:
             if isinstance(l,bytes):
                 GC.sequencing_file2.write(l)
             else:
                 GC.sequencing_file2.write(l.encode())
     chdir(orig_dir)
def extract_from_path(fpath):
    try:
        fp = gopen(fpath,'rb')
        outputf = "%s,%s\t%s\t%s\n"
        _bg,_ed,_src,_eids = "","","",[]
        for (i,l) in enumerate(fp):
            _row = extract_kv_pair(l)
            if not _bg: _bg=_row[0]
            _ed = _row[0]
            if _row[2]:
                #preparing to print out and reset
                _src = _row[2]
                _eids.append(_row[1])
                sys.stdout.write(outputf%(_bg,_ed,','.join(_eids),_src))
                _bg,_ed,_src,_eids = "","","",[]
            else:
                #append uid to the list
                _eids.append(_row[1])

        if len(_eids)>0:
            sys.stdout.write(outputf%(_bg,_ed,','.join(_eids),'NA'))

        fp.close()
    except Exception:
        pass  # silently skip unreadable or malformed input files
Example #9
 def get_edge_list():
     if GC.contact_network_file.lower().endswith('.gz'):
         from gzip import open as gopen
         lines = [
             i.decode().strip() for i in gopen(GC.contact_network_file)
             if len(i.strip()) > 0 and i.strip()[0:1] != b'#'
         ]
     else:
         lines = [
             i.strip() for i in open(GC.contact_network_file)
             if len(i.strip()) > 0 and i.strip()[0] != '#'
         ]
     for line in lines:
         parts = [e.strip() for e in line.split()]
         assert parts[0] in {
             'NODE', 'EDGE'
         }, "Invalid contact network format. First column must be NODE or EDGE"
         if parts[0] == 'NODE':
             assert len(
                 parts
             ) == 3, "Invalid contact network format. NODE rows must have 3 columns"
         else:
             assert len(
                 parts
             ) == 5, "Invalid contact network format. EDGE rows must have 5 columns"
             assert parts[-1] in {
                 'd', 'u'
             }, 'Invalid contact network format. The last column of EDGE rows must be either "d" or "u"'
     return lines
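Per the checks above, a minimal valid contact-network file looks like this (node names and attributes are hypothetical; columns are whitespace-separated, and the generator examples elsewhere on this page write them tab-separated):

NODE	A	attr1
NODE	B	attr2
EDGE	A	B	attr	u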
Example #10
def FastqReader(fastq_file):
    """ Simple fastq reader returning a generator over a fastq file """
    try:

        # Open the file depending of the compression status
        fastq = gopen(fastq_file, "rb") if fastq_file[-2:] == "gz" else open(fastq_file, "rb")
        i = 0

        # Iterate on the file until the end
        while True:

            # Extract informations from the fastq file
            name, seq, sep, qual = next(fastq), next(fastq), next(fastq), next(fastq)

            # Try to generate a valid FastqSeq object
            try:
                yield FastqSeq(name=name.rstrip()[1:].split()[0], seq=seq.rstrip(), qual=qual.rstrip())

                i += 1

            except AssertionError as E:
                print(E)
                print("Skipping the sequence")

    except IOError as E:
        print(E)
        print("Error while reading {} file".format(fastq_file))
        exit()

    except StopIteration:
        # PEP 479: re-raising StopIteration inside a generator turns into
        # a RuntimeError, so report the count and end the generator.
        print("\t{} sequences parsed".format(i))
        return
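A minimal usage sketch for the FastqReader generator above; the input path is hypothetical and FastqSeq comes from the surrounding project:

n_bases = 0
for record in FastqReader("reads.fastq.gz"):  # hypothetical input file
    n_bases += len(record.seq)
print("total bases:", n_bases)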
 def introduce_sequencing_error(node):
     orig_dir = getcwd()
     chdir(GC.out_dir)
     makedirs("ART_output", exist_ok=True)
     chdir("ART_output")
     cn_label = node.get_name()
     for t in GC.final_sequences[cn_label]:
         f = NamedTemporaryFile(mode='w')
         for l, s in GC.final_sequences[cn_label][t]:
             f.write(">%s\n%s\n" % (l, s))
         f.flush()
         command = [GC.art_SOLiD_path] + GC.art_SOLiD_options
         if GC.random_number_seed is not None:
             command += ['-r', str(GC.random_number_seed)]
             GC.random_number_seed += 1
         command.append(f.name)
         command.append('%s_%f' % (cn_label, t))
         command.append(str(GC.art_SOLiD_len_read))
         command.append(str(GC.art_SOLiD_fold_coverage))
         try:
             call(command, stdout=open('%s_%f.log' % (cn_label, t), 'w'))
         except FileNotFoundError:
             chdir(GC.START_DIR)
             assert False, "art_SOLiD executable was not found: %s" % GC.art_illumina_path
         f.close()
         if not hasattr(GC, "sequencing_file"):
             GC.sequencing_file = gopen(
                 '%s/error_prone_files/sequence_data_subsampled_errorprone.fastq.gz'
                 % GC.out_dir, 'wb', 9)
         for l in open('%s_%f.fq' % (cn_label, t)):
             GC.sequencing_file.write(l.encode())
     chdir(orig_dir)
Example #12
 def init():
     assert "ContactNetworkGenerator_File" in str(
         MF.modules['ContactNetworkGenerator']
     ), "Must use ContactNetworkGenerator_File module"
     assert "EndCriteria_TransmissionFile" in str(
         MF.modules['EndCriteria']
     ), "Must use EndCriteria_TransmissionFile module"
     assert "TransmissionNodeSample_TransmissionFile" in str(
         MF.modules['TransmissionNodeSample']
     ), "Must use TransmissionNodeSample_TransmissionFile module"
     assert "TransmissionTimeSample_TransmissionFile" in str(
         MF.modules['TransmissionTimeSample']
     ), "Must use TransmissionTimeSample_TransmissionFile module"
     if GC.transmission_network_file.lower().endswith('.gz'):
         from gzip import open as gopen
         GC.transmission_file = [
             i.decode().strip().split()
             for i in gopen(expanduser(GC.transmission_network_file))
             if len(i.strip()) > 0 and i[0:1] != b'#'
         ]
     else:
         GC.transmission_file = [
             i.strip().split()
             for i in open(expanduser(GC.transmission_network_file))
             if len(i.strip()) > 0 and i[0] != '#'
         ]
     for i in range(len(GC.transmission_file)):
         GC.transmission_file[i][2] = float(GC.transmission_file[i][2])
     GC.transmission_num = 0
Example #13
 def _run(self):
     count = 0
     with gopen(self.pe1) as i:
         for line in i:
             count += 1
     with open(self.output()['read_counts'].path, 'a') as o:
         print(f'{self.sample_name},raw_reads,{count / 4}', file=o)
def stream_file(fn):
    if fn.lower().endswith('.gz'):
        for l in gopen(fn):
            yield l.decode()
    else:
        for l in open(fn):
            yield l
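stream_file above yields decoded text lines whether or not the input is gzipped; a quick usage sketch (path hypothetical):

for line in stream_file("contact_network.txt.gz"):  # hypothetical path
    print(line, end="")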
Example #15
def FastqReader (fastq_file):
    """ Simple fastq reader returning a generator over a fastq file """
    try:

        # Open the file depending of the compression status
        fastq = gopen(fastq_file, "rb") if fastq_file[-2:] == "gz" else open(fastq_file, "rb")
        i=0

        # Iterate on the file until the end
        while True:

            # Extract informations from the fastq file
            name, seq, sep, qual= next(fastq), next(fastq), next(fastq), next(fastq)

            # Try to generate a valid FastqSeq object
            try:
                yield FastqSeq(
                name = name.rstrip()[1:].split()[0],
                seq = seq.rstrip(),
                qual = qual.rstrip())

                i+=1

            except AssertionError as E:
                print(E)
                print ("Skipping the sequence")

    except IOError as E:
        print(E)
        print ("Error while reading {} file".format(fastq_file))
        exit()

    except StopIteration:
        # PEP 479: re-raising StopIteration inside a generator turns into
        # a RuntimeError, so report the count and end the generator.
        print("\t{} sequences parsed".format(i))
        return
Example #16
def read_lines(filename):
    if filename == 'stdin':
        return [l.strip() for l in stdin.read().strip().splitlines()]
    elif filename.lower().endswith('.gz'):
        return [l.strip() for l in gopen(filename).read().decode().strip().splitlines()]
    else:
        return [l.strip() for l in open(filename).read().strip().splitlines()]
Example #17
def opengzip(transmissionHist: str) -> list:
    """
        Helper method - Opens a gzip and returns the lines of the file.
        Parameters
        ----------
        transmissionHist - the gzip to open. the file object with data on
                           tranmissions.
        """

    if isinstance(transmissionHist, str):
        if transmissionHist.lower().endswith('.gz'):
            lines = [
                l.strip()
                for l in gopen(transmissionHist,
                               'rb').read().decode().strip().splitlines()
            ]
        else:
            lines = [
                l.strip()
                for l in open(transmissionHist).read().strip().splitlines()
            ]
    else:
        lines = [
            l.strip() for l in transmissionHist.read().strip().splitlines()
        ]

    return lines
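A usage sketch for opengzip above; the file name is hypothetical, and per the isinstance check the same call also accepts an already-open file object:

for line in opengzip("transmission_network.txt.gz"):  # hypothetical path
    print(line)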
Example #18
def _gzip_file(path):
    with open(path, 'rb') as in_file:  # read bytes to match the binary gzip stream
        gz_path = '%s.gz' % path
        with gopen(gz_path, "wb") as out_file:
            out_file.writelines(in_file)

    return gz_path
Example #19
def read_file(fn):
    if fn.lower().endswith('.gz'):
        return [
            l.strip() for l in gopen(fn).read().decode().strip().splitlines()
        ]
    else:
        return [l.strip() for l in open(fn).read().strip().splitlines()]
Example #20
def load(fname):
    with gopen(fname) as fin:
        try:
            return json.load(fin)
        except Exception as e:
            print("Error reading %s: %s" % (fname, str(e)))
    return None
def main(dir_path, output_dir, fractions):
    script_dir = os.path.dirname(__file__)

    try:
        os.mkdir(output_dir)  # create directory for graphs
    except OSError:
        assert False, "Creation of new folder failed"

    output_folder = os.path.join(script_dir, output_dir)
    fractions_list = [float(fraction) for fraction in fractions.split(',')]
    # iterate over files in contacts folder
    for f in os.listdir(dir_path):
        path = os.path.join(dir_path, f)
        num_nodes = get_num_nodes(path)
        if f.lower().endswith('.gz'):
            cascade = gopen(path)
        elif f.lower().endswith('.txt'):
            cascade = open(path)
        else:
            continue

        # Iterate over all observed fractions we want to generate pkl files for
        for fraction in fractions_list:
            if fraction < 0.1 or fraction > 0.9: continue

            pkl_dump = favites_to_cascade(cascade, num_nodes, fraction)

            # TODO: better prefix name for file
            new_filename = '%s_%0.1f.pkl' % (f, fraction)

            with open(os.path.join(output_folder, new_filename),
                      'wb') as pkl_file:
                pkl.dump(pkl_dump,
                         pkl_file)  # dump to pkl file format required by tool
Example #22
 def save(obj, nodata=False):
     fname = os.path.join(obj.outdir, obj.name) + '.pkl.gz'
     with gopen(fname, 'w+') as fout:
         pickle.dump(obj, fout)
     if not nodata:
         IO.saveData(obj)
    def __call__ (self):
        """Launch the extraction of features """

        print("\nExtract feature sequences")

        # Write the fasta file containing the sequences of the selected features
        fasta_out = self.out_name+".fa.gz"
        print("\n  Write fasta output")
        with gopen (fasta_out, "w") as fout:
            for seq_id, gff_sequence in self.gff_parser.gff_dict.items():

                assert seq_id in self.seq_dict, "fasta and gff are incompatible: {} not found in fasta".format(seq_id)

                print ("    Extracting features from sequence {}".format(seq_id))
                for feature in gff_sequence.features:

                    fout.write(">{}\n{}\n".format(
                            str(feature).replace("\t", self.separator).replace(" ", "_"),
                            self.extract_seq(seq_id, feature.start, feature.end, feature.strand)))

        # Write the gff file containing the selected features if required
        if self.output_gff:
            gff_out = self.out_name+".gff.gz"
            print("\n  Write gff output")
            with gopen (gff_out, "w") as fout:
                fout.write(str(self.gff_parser))

        # Write a report
        report_out = self.out_name+".report.txt"
        print ("\n  Generate a summary report")
        with open (report_out, "w") as fout:
            fout.write ("Program {}\tDate {}\n".format(self.VERSION,str(datetime.today())))
            fout.write ("\n### OPTIONS ###\n")
            fout.write ("Original fasta\t{}\n".format(self.fasta))
            fout.write ("Original gff\t{}\n".format(self.gff))
            fout.write ("Offset\t{}\n".format(self.offset))
            fout.write ("Fusion\t{}\n".format(self.fusion))
            fout.write ("Output gff\t{}\n".format(self.output_gff))
            fout.write ("Restricted features\t{}\n".format("\t".join(self.features)))
            fout.write ("Restricted chromosomes\t{}\n".format("\t".join(self.chromosomes)))
            fout.write ("\n### COUNTS ###\n")
            fout.write ("Sequence(s) in gff file\t{}\n".format(self.gff_parser.all_seq))
            fout.write ("Valid sequence(s) in gff file\t{}\n".format(self.gff_parser.valid_seq))
            fout.write ("Features(s) in gff file\t{}\n".format(self.gff_parser.all_features))
            fout.write ("Valid features(s) in gff file\t{}\n".format(self.gff_parser.valid_features))
            if self.fusion:
                fout.write ("Remaining features after fusion\t{}\n".format(self.gff_parser.fused_features))
Example #24
def get_flankdb(flankpath):
    Path(flankpath).mkdir(parents=True, exist_ok=True)

    print("[-] Preparing flanking virulence gene database")

    patric = fetch_url(
        'ftp://ftp.patricbrc.org/specialty_genes/referenceDBs/PATRIC_VF.faa', None, flankpath + '/patric.faa')
    victors = fetch_url(
        'http://www.phidias.us/victors/downloads/gen_downloads_protein.php', None, flankpath + '/victors.faa')
    vfdb = fetch_url(
        'http://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz', None, flankpath + '/vfdb.faa.gz')

    params = {'query': 'siderophore AND '
                       'taxonomy:"Bacteria [2]" AND '
                       'NOT receptor NOT partial NOT fragment', 'format': 'fasta'}
    bgcs = fetch_url('http://www.uniprot.org/uniprot/', params, flankpath + '/bgcs.faa')

    filenames = [patric, victors, vfdb, bgcs]

    db = ''

    for fname in filenames:
        if fname.endswith('.gz'):
            with gopen(fname, 'rt') as infile:
                for line in infile:
                    db += line
        else:
            with open(fname, 'rt') as infile:
                for line in infile:
                    db += line
        remove(fname)

    d1 = db.count('>')
    print(f"[-] {d1} total proteins downloaded")
    accessions = set()
    db2 = ''
    for r in parse(StringIO(db), 'fasta'):
        if r.id not in accessions:
            accessions.add(r.id)
            db2 += r.format('fasta')
    d2 = db2.count('>')
    print(f"[-] Removed {d1 - d2} duplicate accessions")

    fasta_lines = db2.split('>')  # splits each sequence by header

    def remove_complete_duplicates(fasta_lines):
        print(f"[>] Removing redundancy...  ", end="", flush=True)
        outputlist, setofuniqsequence = [], set()
        for sequence in fasta_lines:
            if sequence not in setofuniqsequence:
                outputlist.append(sequence)
                setofuniqsequence.add(sequence)
        print(f"{len(outputlist)} proteins remaining")
        return outputlist

    with open(flankpath + '/flankdb', 'w')  as flank_file:
        flank_file.write('>'.join(remove_complete_duplicates(fasta_lines)))

    return run_makeblastdb(flankpath + '/flankdb', 'prot', flankpath + '/flankdb')
Example #25
def ConvertTar2TarGz(temp_tarfile, dest_targzfile):
    try:
        os.remove(dest_targzfile)  # drop any stale output before writing
    except OSError:
        pass
    with open(temp_tarfile, 'rb') as f_in:
        with gopen(dest_targzfile, 'wb') as f_out:
            copyfileobj(f_in, f_out)
Example #26
def saveCache():
    try:
        f = gopen(w.persistFile, 'w')
        dump(cache, f, -1)
        dump(g.immortal, f, -1)
        f.close()
    except Exception:
        print('saveCache("%s") failed' % w.persistFile)
 def get_edge_list():
     orig_dir = getcwd()
     makedirs(PANGEASIM_OUTPUT_DIR, exist_ok=True)
     chdir(PANGEASIM_OUTPUT_DIR)
     outfile = open('log.txt','w')
     try:
         call([GC.PangeaSim_Acute], stderr=STDOUT, stdout=outfile); outfile.close()
     except:
         outfile.close(); raise RuntimeError("PangeaSim crashed. See %s/log.txt for information" % PANGEASIM_OUTPUT_DIR)
     cn_list = []; GC.transmission_file = []; infected_by_acute = set()
     for f in glob('*.csv'):
         if f.startswith('phylogenetic_individualdata'): # individual attributes
             for l in open(f):
                 if l.startswith('Id') or len(l.strip()) == 0: # header line
                     continue
                 p = l.strip().split(',')
                 cn_list.append('NODE\t%s\t%s' % (p[0],','.join(p[1:])))
             remove(f)
         elif f.startswith('phylogenetic_transmission'): # transmission network
             for l in open(f):
                 if l.startswith('IdInfector') or len(l.strip()) == 0: # header line
                     continue
                 u,v,t,acute_infector = l.strip().split(',')
                 if u == '-1': # seed infection
                     u = None
                 GC.transmission_file.append((u,v,float(t)))
                 if acute_infector.strip() == '1':
                     infected_by_acute.add(v)
             remove(f)
         elif f.startswith('Annual'):
             tmp = gopen('../PangeaSim_annual_survey.csv.gz','wb',9)
             tmp.write(open(f).read().encode())
             tmp.close()
             remove(f)
     assert len(cn_list) != 0 and len(GC.transmission_file) != 0, "PangeaSim error. See %s/log.txt for information" % PANGEASIM_OUTPUT_DIR
     for u,v,t in GC.transmission_file:
         if u is not None:
             cn_list.append('EDGE\t%s\t%s\t%s\td' % (u,v,{True:'AcuteInfector',False:'NonAcuteInfector'}[v in infected_by_acute]))
     f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir),'wb',9)
     f.write(b'# Attributes: Id,Sex,DoB,DoD,HIV_pos,RiskGp,t_diagnosed,cd4_diagnosis,cd4atfirstART,t_1stARTstart,t_1stVLsupp_start,t_1stVLsupp_stop\n')
     f.write('\n'.join(cn_list).encode()); f.write(b'\n')
     f.close()
     remove('log.txt')
     chdir(orig_dir)
     rmdir(PANGEASIM_OUTPUT_DIR)
     return cn_list
Example #28
def write_file(s, fn):
    if fn.lower().endswith('.gz'):
        f = gopen(fn, 'wb', 9)
        f.write(s.encode())
    else:
        f = open(fn, 'w')
        f.write(s)
    f.close()
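write_file above is the writing counterpart of read_file from Example #19; a quick round trip (file name hypothetical):

write_file("hello\nworld", "demo.txt.gz")  # gzip-compressed because of the .gz suffix
assert read_file("demo.txt.gz") == ["hello", "world"]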
Example #29
    def writer(self, R1_outname, R2_outname):
        """
        Write sequence couples from outqueue in a pair of fastq files. Sequences will remain
        paired (i.e. at the same index in the 2 files) but they may not be in the same order
        as in the input fastq files. The process will continue until n = n_thread STOP pills have
        been found in the outqueue (i.e. the queue is empty)
        """
        # Open output fastq streams for writing
        try:
            out_R1 = gopen(R1_outname, "wb") if self.compress_output else open(
                R1_outname, "wb")
            out_R2 = gopen(R2_outname, "wb") if self.compress_output else open(
                R2_outname, "wb")

            current_seq = 0
            buffer_R1 = ""
            buffer_R2 = ""

            # Keep running until all thread STOP pills has been passed
            for works in range(self.n_thread):
                # Will exit the loop as soon as a STOP pill will be found
                for read1, read2 in iter(self.outq.get, "STOP"):

                    with self.total_pass.get_lock():
                        self.total_pass.value += 1

                    buffer_R1 += read1.fastqstr
                    buffer_R2 += read2.fastqstr

                    if self.total_pass.value % self.buffer_size == 0:
                        out_R1.write(buffer_R1)
                        out_R2.write(buffer_R2)
                        buffer_R1 = ""
                        buffer_R2 = ""

            out_R1.write(buffer_R1)
            out_R2.write(buffer_R2)
            buffer_R1 = ""
            buffer_R2 = ""

            out_R1.close()
            out_R2.close()

        except IOError as e:
            print("I/O error({}): {}".format(e.errno, e.strerror))
 def on_data(self, data):
     with gopen("%s/tweet_%d.txt.gz" % (self.folder, self.next_id),
                "wt+") as fout:
         fout.write(data)
         print('.', )
     self.next_id += 1
     if self.next_id % 20 == 0:
         print()
     return True
Example #31
 def get_edge_list():
     cn = complete_graph(GC.num_cn_nodes)
     out = GC.nx2favites(cn, 'u')
     f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb',
               9)
     f.write('\n'.join(out).encode())
     f.write(b'\n')
     f.close()
     return out
Example #32
 def run(self):
     count = 0
     with gopen(self.reads.output()['reads_1'].local_path()) as i:
         for line in i:
             count += 1
     count /= 4
     target = self.output()['read_count']
     target.set_payload(count)
     target.upload()
Example #33
def extractAlignmentsRX(f_align, f_align_p, f_stats):
    """ Extracts the alignments with regex.

    Easier to parse HUN aligned files, which will be dropped due to inconsistencies. Mainly used for the small
    OpenSubtitles corpus not the 2011er one.
    """
    print "Extracting alignments"

    alignments = {}
    final = {}
    hun_files = set()
    doc_count = 0
    link_count = 0

    with gopen(f_align) as align_f:
        for line in align_f:
            line = line.strip()

            if line.startswith("<linkGrp"):
                doc_count += 1
                m = search("fromDoc=\"(.+)\"\stoDoc=\"(.+)\"", line)

                if m:
                    key = (m.group(1), m.group(2))
                elif not m:
                    m = search("toDoc=\"(.+)\"\sfromDoc=\"(.+)\"", line)
                    key = (m.group(2), m.group(1))
                alignments.setdefault(key, [])
            elif line.startswith("<link id="):
                link_count += 1
                m = search("xtargets=\"(.+?)\"", line)
                alignments[key].append(m.group(1).split(";"))
            elif line.startswith("<link certainty="):
                hun_files.add(key)

                if key in alignments:
                    del alignments[key]
                continue

    empty = set()

    for k, v in alignments.items():
        if len(v) != 0:
            final.setdefault(k, v)
        else:
            empty.add(k)
    dumpStruct(f_align_p, final)
    createPath(f_stats)

    with open(f_stats, "w") as stats:
            stats.write("DOCS: %d\nHUN: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n" %
                       (doc_count, len(hun_files), len(empty), len(final), link_count))

            for k in hun_files:
                stats.write(k[0] + " || " + k[1] + "\n")
            stats.write("\n")
Example #34
def openreadfile(filename):
    '''
    Open a file for reading.
    Use gzip.open if it's a .gz file.
    '''
    from gzip import open as gopen
    if filename.lower().endswith('.gz'):
        f = gopen(filename, 'rb')
    else:
        f = open(filename, 'r')
    return f
Example #35
 def decompress(self, filename):
     basename = filename.split('.')[:-1]
     txt_file = '.'.join(basename)
     logging.info('Decompressing %s to %s', filename, txt_file)
     with open(txt_file, 'wb') as tf:  # write bytes, matching what gf.read() returns
         with gopen(filename, 'rb') as gf:
             buffer = gf.read(4096)
             while buffer:
                 tf.write(buffer)
                 buffer = gf.read(4096)
     logging.info('Decompressing %s finished', filename)
Example #36
def extractAlignmentsLXML(f_align, f_align_p, f_stats):
    """ Extracts alignment information from the alignments file with LXML.

    Used for the large OpenSubtitles 2011 corpus for faster processing.
    """
    print "Extracting alignments"

    class Target(object):
        def __init__(self):
            self.d = dict()
            self.n_links = 0
            self.n_docs = 0

        def start(self, tag, attr):
            if tag == "linkGrp":
                self.n_docs += 1
                self.k = (attr["fromDoc"], attr["toDoc"])
                self.group = self.d[self.k] = []
            elif tag == "link":
                self.n_links += 1
                self.group.append(tuple(attr["xtargets"].split(";")))

                if "certainty" in attr:
                    print "Attention HUN: %s" % self.k

        def close(self):
            pass

    with gopen(f_align) as xml:
        targets = Target()
        parser = etree.XMLParser(target=targets)
        etree.parse(xml, parser)

    alignments = targets.d

    # Documents with no alignments
    empty = set()

    # Iterate over a copy of the items: entries are deleted from the dict
    # inside the loop (alignments is the same dict as targets.d).
    for k, v in list(alignments.items()):
        if not len(v):
            empty.add(k)
            del targets.d[k]

    dumpStruct(f_align_p, alignments)
    createPath(f_stats)

    with open(f_stats, "w") as stats:
        stats.write("DOCS: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n" %
                    (targets.n_docs, len(empty), len(alignments), targets.n_links))

        for k in empty:
            stats.write("!!! Empty files\n%s || %s\n" % (k[0], k[1]))
            stats.write("\n")
Example #37
def _summary_not_demultiplexed(artifact_type, filepaths):
    """Generates the HTML summary for non Demultiplexed artifacts

    Parameters
    ----------
    artifact_type : str
        The artifact type
    filepaths : [(str, str)]
        A list of string pairs where the first element is the filepath and the
        second is the filepath type

    Returns
    -------
    list
        A list of strings with the html summary
    """
    # loop over each of the fps/fps_type pairs
    artifact_information = []
    for fps_type, fps in sorted(filepaths.items()):
        # Step 2: generate HTML summary
        # md5, from http://stackoverflow.com/a/3431838
        for fp in fps:
            with open(fp, "rb") as f:
                hash_md5 = md5()
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)

            # getting head of the files
            header = []
            if artifact_type not in FILEPATH_TYPE_TO_NOT_SHOW_HEAD:
                # we need to encapsulate the full for loop because gzip will
                # not raise an error until you try to read
                try:
                    with gopen(fp, 'r') as fin:
                        header = [line for line, _ in zip(
                            fin, xrange(LINES_TO_READ_FOR_HEAD))]
                except IOError:
                    with open(fp, 'r') as fin:
                        header = [line for line, _ in zip(
                            fin, xrange(LINES_TO_READ_FOR_HEAD))]
            filename = basename(fp)
            artifact_information.append(
                "<h3>%s (%s)</h3>" % (filename, fps_type))
            artifact_information.append("<b>MD5:</b>: %s</br>" %
                                        hash_md5.hexdigest())
            if header:
                artifact_information.append(
                    "<p style=\"font-family:'Courier New', Courier, monospace;"
                    "font-size:10;\">%s</p><hr/>" % ("<br/>".join(header)))

    return artifact_information
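The inline comment above points at a real gzip quirk: gopen() succeeds even on a non-gzip file and only raises once you try to read, so the whole read loop has to sit inside the try block. A minimal sketch of the same fallback idiom (function and file names hypothetical):

from gzip import open as gopen

def head(fp, n=5):
    try:
        with gopen(fp, 'rt') as fin:
            # A "Not a gzipped file" error surfaces here, on read, not on open.
            return [line for line, _ in zip(fin, range(n))]
    except OSError:  # covers gzip.BadGzipFile, which subclasses OSError
        with open(fp, 'r') as fin:
            return [line for line, _ in zip(fin, range(n))]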
Example #38
    def __init__(self, len_seq=500, n_seq=1, gziped=False):
        self.seq_dict = {}
        self.temp_dir = mkdtemp()

        if gziped:
            self.fasta_path = path.join(self.temp_dir, "random.fa.gz")
            with gopen (self.fasta_path, "w") as fp:
                for i in range(n_seq):
                    seq = rDNA(len_seq)
                    self.seq_dict["seq_{}".format(i)] = seq
                    fp.write (">seq_{}\n{}\n".format(i, seq))
        else:
            self.fasta_path = path.join(self.temp_dir, "random.fa")
            with open (self.fasta_path, "w") as fp:
                for i in range(n_seq):
                    seq = rDNA(len_seq)
                    self.seq_dict["seq_{}".format(i)] = seq
                    fp.write (">seq_{}\n{}\n".format(i, seq))
Example #39
    def __call__(self):
        """ Simple fastq reader returning a generator over a fastq file """
        try:

            # Open the file depending of the compression status
            fastq = gopen(self.fastq_file, "rb") if self.fastq_file[-2:] == "gz" else open(self.fastq_file, "rb")
            

            # Iterate on the file until the end
            while True:
    
                # Extract informations from the fastq file
                name, seq, sep, qual = next(fastq), next(fastq), next(fastq), next(fastq)
                
                split_name = name.split(":")
                
                # Try to generate a valid FastqSeq object
                try:
                    yield FastqSeq(
                    sampleName =  ":".join(split_name[0:-2])[1:],
                    seq = seq.rstrip(),
                    qual = qual.rstrip(),
                    sampleIndex = split_name[-2].rstrip(),
                    molecularIndex = split_name[-1].rstrip())
                
                    self.n_seq += 1
                 
                except AssertionError as E:
                    print(E)
                    print ("Skipping the sequence")

        except IOError as E:
            print(E)
            print ("Error while reading {} file".format(self.fastq_file))
            exit()

        except StopIteration:
            fastq.close()
            # PEP 479: re-raising StopIteration inside a generator turns into
            # a RuntimeError, so report the count and end the generator.
            print("\t{} sequences parsed".format(self.n_seq))
            return
Example #40
    def output_reference (self):
        """
        Output a reference corresponding to the original sequence, but masked with a masking
        character for bases overlapped by a BlastHit.
        """
        # Count the number of hit in all Sequence objects from the Reference
        if not self.n_hit:
            return None

        # Write a new compressed reference in the current folder
        elif self.compress:
            with gopen (self.modified_fasta, "wb") as fasta:
                for seq in self.seq_dict.values():
                    # Write the sequence in the fasta file
                    fasta.write(">{}\n{}\n".format(seq.name, seq.output_sequence()))
            return self.modified_fasta

        # Write a new uncompressed reference in the current folder
        else:
            with open (self.modified_fasta, "w") as fasta:
                for seq in self.seq_dict.values():
                    # Write the sequence in the fasta file
                    fasta.write(">{}\n{}\n".format(seq.name, seq.output_sequence()))
            return self.modified_fasta
Example #41
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necessary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = list(zip(*quals))  # materialise; the columns are iterated twice below
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)] # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)] # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)] # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)        
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)]
                                            if paired else 0)))
                       / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') %(
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
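A hedged usage sketch for quality_plot above; the FASTQ path is hypothetical and 'MboI' is only an example enzyme name assumed to exist in RESTRICTION_ENZYMES:

# Inspect the first 100,000 reads and save the figure instead of showing it.
des, lig = quality_plot('sample.fastq.gz', r_enz='MboI', nreads=100000,
                        paired=False, savefig='quality_plot.png')
print('dangling-ends: %.1f%%, reads with ligation site: %.1f%%' % (des, lig))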
Example #42
def quality_plot(fnam, nreads=None, axe=None, savefig=None):
    """
    Plot the qualities

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necessary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    phred = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'
    quals = []
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    if nreads:
        while True:
            try:
                next(fhandler)
            except StopIteration:
                break
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred.index(i) for i in line.strip()])
            if len(quals) > nreads:
                break
    else: # do this because it's faster
        while True:
            try:
                next(fhandler)
            except StopIteration:
                break
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred.index(i) for i in line.strip()])
    fhandler.close()

    quals = list(zip(*quals))  # materialise; the columns are iterated twice below
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        plt.clf()
    else:
        fig = plt.figure()
        plt.clf()
        ax = fig.add_subplot(111)
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    plt.figure(figsize=(15, 7))
    plt.errorbar(range(len(line.strip())), meanquals,
                 yerr=errorquals, ecolor='orange')

    plt.xlim((0, len(line)))
    plt.xlabel('Sequence')
    plt.ylabel('PHRED score')
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
from classes import UniprotKB, Refseq

with Database.db as cursor :

	cursor.execute( "TRUNCATE TABLE " + Config.DB_NAME + ".protein_mapping" )
	Database.db.commit( )
	
	uniprotKB = UniprotKB.UniprotKB( Database.db, cursor )
	refseq = Refseq.Refseq( Database.db, cursor )
	
	refseqHash = refseq.buildFullRefseqMappingHash( )
	uniprotHash = uniprotKB.buildAccessionHash( )
	
	mapping = set( )
	
	with gopen( Config.EG_REFSEQ2UNIPROT ) as file :
		
		insertCount = 0
		for line in file.readlines( ) :
			line = line.strip( )
			
			# Skip Blank Lines and header lines
			if len( line ) <= 0 or "#" == line[0] :
				continue
				
			splitLine = line.split( "\t" )
			refseqAcc = splitLine[0].strip( )
			uniprotAcc = splitLine[1].strip( )
			
			if refseqAcc in refseqHash and uniprotAcc in uniprotHash :
				refseqID = refseqHash[refseqAcc]
Example #44
    def process_single_end (self):

        count = OrderedDict()

        ##### CUTADAPT #####
        if self.skip_cutadapt:
            print ("\nSkiping cutadapt step")
            trimmed_fastq=self.fastq_R1

        else:
            print ("\nStarting trimming with CUTADAPT")
            trimmed_fastq = self.basename+"_trim.fastq.gz"
            cutadapt_report = self.basename+"_trim_report.txt"

            cmd = "cutadapt {} {} {} -o {}".format (self.cutadapt_opt, \
            "-a file:"+self.adapter if self.adapter else "", self.fastq_R1, trimmed_fastq)
            print (cmd)

            if self.run:
                with open (cutadapt_report, "w") as fout:
                    for line in self.yield_cmd(cmd):
                        fout.write(line)

                # Extract values from cutadapt_report
                with open (cutadapt_report, "r") as fin:
                    for line in fin:
                        if line.startswith("Total reads processed:"):
                            count["Total reads before trimming"] = int(line.split()[-1].replace(",",""))
                        if line.startswith("Reads with adapters:"):
                            count["Reads with adapters"] = int(line.split()[-2].replace(",",""))
                        if line.startswith("Reads that were too short:"):
                            count["Reads that were too short"] = int(line.split()[-2].replace(",",""))
                        if line.startswith("Reads written (passing filters):"):
                            count["Reads after trimming"] = int(line.split()[-2].replace(",",""))

        ##### BWA #####
        print ("\nStart aligning with BWA MEM and sort reads")
        mapped_bam = self.basename+"_mapped.bam"
        unmapped_fastq = self.basename+"_clean.fastq.gz"

        # Prepare the command line
        cmd = "bwa mem {0} -t {1} {2} {3}".format(self.bwa_opt, self.thread, self.index, trimmed_fastq)
        print (cmd)

        if self.run:
            #counters
            total = mapped = unmapped = 0

            # Initialize the stream line per line generator
            sam = self.yield_cmd(cmd)

            # Initialize and parse the bam header
            h = BAMHeader ()
            for line in sam:

                # Add the line to BAMheader object until the first non header line is found
                if line.startswith("@"):
                    h.add_header_line(line)
                else:
                    break

            # Initialize a bam read parser
            bam_parser = BAMSequenceParser (header=h, skip_secondary=False)

            # Create an output bam file for mapped reads and an output fastq file for unmapped reads
            with \
                pysam.AlignmentFile (mapped_bam , "wb", header=h.header) as bam_out,\
                gopen (unmapped_fastq, "w") as fastq_out:

                # Process the first sequence found when parsing header
                read = bam_parser.parse_line(line)
                total += 1
                if read:
                    if read.is_properly_mapped(self.min_mapq, self.min_match_size):
                        bam_out.write(read.to_bam())
                        mapped += 1
                    else:
                        fastq_out.write(read.to_fastq())
                        unmapped += 1

                # Process the remaining sequences
                for line in sam:
                    read = bam_parser.parse_line(line)
                    total += 1
                    if total % 1000000 == 0:
                        print ("{} sequence processed".format(total))
                    if read:
                        if read.is_properly_mapped(self.min_mapq, self.min_match_size):
                            bam_out.write(read.to_bam())
                            mapped += 1
                        else:
                            fastq_out.write(read.to_fastq())
                            unmapped += 1

            # Retrieve Count values from the BAMSequenceParser object
            count["Total reads processed by BWA"] = total
            count["Reads Mapped"] = mapped
            count["Reads Unmapped"] = unmapped
            count["Primary read"] = bam_parser.count["primary"]
            count["Secondary read"] = bam_parser.count["secondary"]
            count["Invalid reads"] = bam_parser.count["invalid"]

        return count
	cursor.execute( "SELECT uniprot_id FROM " + Config.DB_NAME + ".uniprot WHERE organism_id=%s", [organismID] )
	
	for row in cursor.fetchall( ) :
	
		uniprotID = row[0]
	
		# Inactivate proteins in these two tables only for the specific organism
		cursor.execute( "UPDATE " + Config.DB_NAME + ".uniprot SET uniprot_status='inactive' WHERE uniprot_id = %s", [uniprotID] )
		cursor.execute( "UPDATE " + Config.DB_NAME + ".uniprot_features SET uniprot_feature_status='inactive' WHERE uniprot_id = %s", [uniprotID] )
		
		# Delete the annotation associated with the deactivated records
		cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_aliases WHERE uniprot_id=%s", [uniprotID] )
		cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_definitions WHERE uniprot_id=%s", [uniprotID] )
		cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_externals WHERE uniprot_id=%s", [uniprotID] )
		cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_go WHERE uniprot_id=%s", [uniprotID] )
		
	Database.db.commit( )
		
	filename = Config.UP_PROTEINS_DIR + "uniprot_proteins_" + str(organismID) + ".xml.gz"
	print "Working on : " + str(organismID) + " (" + filename + ")"
		
	with gopen( filename ) as uniprotFile :
		parse( uniprotFile, UniprotProteinXMLParser.UniprotProteinXMLParser( uniprotKB, organismID ) )
	
	Database.db.commit( )

	cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'UNIPROT_updateProteins', NOW( ) )" )
	Database.db.commit( )
	
sys.exit( )
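# The four DELETE statements above differ only in the table name. Table names
# cannot be bound as query parameters, so a loop over a hard-coded tuple keeps
# the same behaviour with less repetition (a refactoring sketch reusing the
# script's cursor, Config and uniprotID, not the original code):

annotation_tables = ( "uniprot_aliases", "uniprot_definitions",
                      "uniprot_externals", "uniprot_go" )

for table in annotation_tables :
	# table comes from a fixed tuple, never from user input, so the string
	# concatenation is safe; the record id is still parameterized
	cursor.execute( "DELETE FROM " + Config.DB_NAME + "." + table + " WHERE uniprot_id=%s", [uniprotID] )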
			
Exemple #46
0
def myopen(fname, mode='r'):
    # Transparently open gzip-compressed or plain files based on the suffix
    if fname.endswith('.gz'):
        from gzip import open as gopen
        return gopen(fname, mode)
    else:
        return open(fname, mode)
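# A quick usage sketch (the file name is hypothetical). In Python 3,
# gzip.open defaults to binary mode, so pass an explicit text mode when
# mixing compressed and uncompressed inputs:

with myopen('reads.fastq.gz', mode='rt') as handle:
    n_lines = sum(1 for _ in handle)
print(n_lines)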
Exemple #47
0
# Parses the UNIPROT_SWISSPROT file and pulls out only the accessions
# into the staging area table

import sys, string
import MySQLdb
import Database
import Config

from classes import UniprotAccessionXMLParser
from xml.sax import parse
from gzip import open as gopen

with Database.db as cursor :

	cursor.execute( "TRUNCATE TABLE " + Config.DB_STAGING + ".swissprot_ids" )
	Database.db.commit( )

	with gopen( Config.UP_SWISSPROT ) as swissprot :
		parse( swissprot, UniprotAccessionXMLParser.UniprotAccessionXMLParser( ) )
	
sys.exit( )
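# UniprotAccessionXMLParser is project code that is not shown here. As a rough
# illustration of the SAX pattern the script relies on (the element name and
# handler are assumptions, not the project's actual parser), a minimal handler
# collecting <accession> text could look like this:

from xml.sax.handler import ContentHandler

class AccessionHandler(ContentHandler):
    """Sketch: collect the text of every <accession> element"""
    def __init__(self):
        ContentHandler.__init__(self)
        self.in_accession = False
        self.accessions = []
        self._buffer = []

    def startElement(self, name, attrs):
        if name == "accession":
            self.in_accession = True
            self._buffer = []

    def characters(self, content):
        # characters() may be called several times within one text node
        if self.in_accession:
            self._buffer.append(content)

    def endElement(self, name):
        if name == "accession":
            self.accessions.append("".join(self._buffer).strip())
            self.in_accession = False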
Exemple #48
0
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new folder as text files.
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0

    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)

            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)

                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0

                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()

                            if line.startswith("<s"):
                                last_id = match('.*id="([0-9]+)"', line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("'", "")

                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)

                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n'
                        % (last_id, words, lang.replace(".gz", ""))
                    )

                    for k, v in enumerate(doc):
                        sid = k + 1

                        if len(v) > 1:
                            xml_f.write('<s id="%s">%s</s>\n' % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))

                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0

        for v in align_p_f.itervalues():
            scount += len(v)

        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))

    dumpStruct(f_align_p, align_p_f)
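# loadStruct, dumpStruct and createPath are helpers from the surrounding
# project and are not shown. If they follow the usual pattern they are thin
# wrappers like this sketch (a guess, not the project's actual code):

import os
import pickle

def loadStruct(path):
    """Deserialize a previously dumped structure from disk"""
    with open(path, "rb") as f:
        return pickle.load(f)

def dumpStruct(path, struct):
    """Serialize a structure to disk, mirroring loadStruct"""
    with open(path, "wb") as f:
        pickle.dump(struct, f)

def createPath(fname):
    """Ensure the directory of fname exists before writing to it"""
    d = os.path.dirname(fname)
    if d and not os.path.exists(d):
        os.makedirs(d)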
Exemple #49
0
import re

import Database
import Config

from gzip import open as gopen
from classes import UniprotKB

# NOTE: re.VERBOSE is not used here because it would make the engine ignore
# the literal spaces in the pattern; '_{1}' is equivalent to plain '_'
descRE = re.compile( '^([A-Z0-9]+_[A-Z0-9]+) (.*?) (OS=.*?)? (GN=(.*?))?$' )

with Database.db as cursor :

	cursor.execute( "UPDATE " + Config.DB_NAME + ".uniprot_isoforms SET uniprot_isoform_status='inactive'" )
	Database.db.commit( )

	uniprotKB = UniprotKB.UniprotKB( Database.db, cursor )
	accessionHash = uniprotKB.buildAccessionHash( )
	organismHash = uniprotKB.buildOrganismHash( )
	
	with gopen( Config.UP_ISOFORMS ) as file :
			
		currentInfo = { }
			
		for line in file.readlines( ) :
			line = line.strip( )
				
			# Skip Blank Lines
			if len( line ) <= 0 :
				continue
				
			if ">" == line[0] :
				if len( currentInfo ) > 0 :
										
					if currentInfo["ACCESSION"] in accessionHash :
						uniprotID = accessionHash[currentInfo["ACCESSION"]]
Exemple #50
0
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, the function can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    The proportion of dangling-ends is inferred by counting the number of times
    a dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file
    :param None r_enz: name of the restriction enzyme used, or a list of names
    :param inf nreads: maximum number of reads to process (no need to read the
       whole file)
    :param None axe: matplotlib axis on which to draw the plot
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using the matplotlib GUI (the extension
       of the file name determines the desired format)
    :param False paired: whether the input FASTQ contains both read ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage
       of reads with at least one ligation site.
    """
    # Map each printable ASCII character to its Phred score (Phred+33: '!' -> 0)
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    else:  # no enzyme provided: only the quality curve will be plotted
        r_enzs = None
    if r_enzs:
        # normalize enzyme names to their canonical spelling
        for k in RESTRICTION_ENZYMES.keys():
            for i in range(len(r_enzs)):
                if k.lower() == r_enzs[i].lower():
                    r_enzs[i] = k

    quals = []   # per-read lists of Phred quality scores
    henes = []   # positions at which an 'N' base was found
    sites = {}   # undigested restriction-site positions, per enzyme
    fixes = {}   # dangling-end (repaired) site positions, per enzyme
    liges = {}   # ligation-site positions, per enzyme pair
    ligep = {}   # number of reads containing each ligation site
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enzs:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:  # end of file
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # no read limit given: skip the per-read length check (faster)
            while True:
                try:
                    next(fhandler)
                except StopIteration:  # end of file
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # positions of undigested restriction sites
            fixes[r_enz] = []  # positions of dangling-end (repaired) sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # positions of ligation sites for this enzyme pair
            ligep[k] = 0   # number of reads containing this ligation site
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in l_sites.values():
                seq = seq.replace(lig.upper(), lig)
            for r_enz in r_enzs:
                sites[r_enz].extend([m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend([m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each pair of cut-sites
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads or nreads == float('inf'):
        nreads = len(quals)  # report the number of reads actually processed
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if r_enz:  # do both plots
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:  # only do the quality_plot plot
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in xrange(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if r_enzs:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(r_enzs), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the profile to plot: the last site_len
        # positions cannot hold a full cut-site or ligation site, so they
        # are excluded.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k) for k in xrange(seq_len)] # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k) for k in xrange(seq_len)] # DE
        for r1, r2 in liges:
            liges[(r1, r2)] = [liges[(r1, r2)].count(k) for k in xrange(seq_len)] # OK

        # in case the undigested cut-site pattern contains the repaired
        # (dangling-end) pattern, those sites were counted twice: once as
        # undigested and once as repaired. We subtract them from the
        # repaired counts:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])

                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k-pos]
                                 for k in xrange(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] +
                                 [fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                                  for k in xrange(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len
                fixes[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            # print 'undigested', r_enz
            # print sites[r_enz][:20]
            ax2.plot(sites[r_enz], linewidth=2, color = color.next(),
                     alpha=0.9,
                     label='Undigested RE site (%s: %s)' % (r_enz, r_sites[r_enz])
                     if any([f > 0 for f in fixes[r_enz]])
                     else 'Undigested & Dangling-Ends (%s: %s)' % (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            # print 'ligated', r1, r2
            # print liges[(r1, r2)][:20]
            ax3.plot(liges[(r1, r2)], linewidth=2, color=color.next(),
                     alpha=0.9,
                     label = 'Ligated (%s-%s: %s)' % (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                # print 'repaired', r_enz
                # print fixes[r_enz][:20]
                ax4.plot(fixes[r_enz], linewidth=2, color=color.next(),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][max_seq_len / 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len / 2])

        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = ((100. * (fixes[r_enz][0] + (fixes[r_enz][(max_seq_len / 2)]
                                                          if paired else 0))) / nreads)
            else:
                des[r_enz] = (100. * (sites[r_enz][0] + (sites[r_enz][(max_seq_len / 2)]
                                                         if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                              for r_enz1 in r_enzs for r_enz2 in r_enzs
                              if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += ('Percentage of digested sites (not considering Dangling-Ends) '
                      '%s: %.1f%%\n' % (r_enz,
                                        100. * float(lcnt) / (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz, des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += ('Percentage of reads with ligation site (%s-%s): %.1f%% \n' %
                          (r_enz1, r_enz2, (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    return des, ligep
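# A hedged usage sketch: the file name, enzyme and read count below are
# illustrative only. quality_plot returns two dicts, keyed by enzyme and
# by enzyme pair respectively:

dangling, ligated = quality_plot('sample_read1.fastq.gz',
                                 r_enz='HindIII',
                                 nreads=100000,
                                 savefig='sample_quality.png')
print('dangling-ends: %s' % dangling)   # {enzyme: percentage}
print('ligation sites: %s' % ligated)   # {(enz1, enz2): percentage}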