def get_edge_list():
    cn = barbell_graph(GC.barbell_m1, GC.barbell_m2)
    out = GC.nx2favites(cn, 'u')
    f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb', 9)
    f.write('\n'.join(out).encode())
    f.write(b'\n')
    f.close()
    # only left and right communities, not the path
    GC.cn_communities = [
        {i for i in range(GC.barbell_m1)},
        {i for i in range(GC.barbell_m1 + GC.barbell_m2, 2*GC.barbell_m1 + GC.barbell_m2)}
    ]
    f = gopen(expanduser("%s/contact_network_partitions.txt.gz" % GC.out_dir), 'wb', 9)
    f.write(str(GC.cn_communities).encode())
    f.write(b'\n')
    f.close()
    GC.cn_communities = [{str(i) for i in c} for c in GC.cn_communities]
    return out
def flush_buffers(self):
    """Append to R1 and R2 fastq.gz files"""
    # the buffers are plain strings (they are reset to ""), so encode them
    # before writing to the binary gzip streams opened in 'ab' mode
    with gopen(self.R1_fastq_name, "ab") as fastq_file:
        fastq_file.write(self.R1_buffer.encode())
    self.R1_buffer = ""
    with gopen(self.R2_fastq_name, "ab") as fastq_file:
        fastq_file.write(self.R2_buffer.encode())
    self.R2_buffer = ""
def introduce_sequencing_error(node):
    if not hasattr(GC, "sequencing_file"):
        if GC.art_454_amplicon_mode == "single":
            GC.sequencing_file = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone.fastq.gz' % GC.out_dir, 'wb', 9)
        else:
            GC.sequencing_file = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone_read1.fastq.gz' % GC.out_dir, 'wb', 9)
            GC.sequencing_file2 = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone_read2.fastq.gz' % GC.out_dir, 'wb', 9)
    orig_dir = getcwd()
    chdir(GC.out_dir)
    makedirs("ART_output", exist_ok=True)
    chdir("ART_output")
    cn_label = node.get_name()
    for t in GC.final_sequences[cn_label]:
        f = NamedTemporaryFile(mode='w')
        for l, s in GC.final_sequences[cn_label][t]:
            f.write(">%s\n%s\n" % (l, s))
        f.flush()
        command = [GC.art_454_path] + GC.art_454_options
        if GC.random_number_seed is not None:
            command += ['-r', str(GC.random_number_seed)]
            GC.random_number_seed += 1
        if GC.art_454_amplicon_mode == "single":
            command.append('-A')
        else:
            command.append('-B')
        command.append(f.name)
        command.append('%s_%f' % (cn_label, t))
        command.append(str(GC.art_454_reads_pairs_per_amplicon))
        try:
            call(command, stdout=open('%s_%f.log' % (cn_label, t), 'w'), stderr=STDOUT)
        except FileNotFoundError:
            chdir(GC.START_DIR)
            assert False, "art_454 executable was not found: %s" % GC.art_454_path
        f.close()
        if GC.art_454_amplicon_mode == "single":
            for l in open('%s_%f.fq' % (cn_label, t)):
                GC.sequencing_file.write(l.encode())
        else:
            rename('%s_%f.fq' % (cn_label, t), '%s_%f_read1.fq' % (cn_label, t))
            for l in open('%s_%f_read1.fq' % (cn_label, t)):
                GC.sequencing_file.write(l.encode())
            rename('%s_%f2.fq' % (cn_label, t), '%s_%f_read2.fq' % (cn_label, t))
            for l in open('%s_%f_read2.fq' % (cn_label, t)):
                GC.sequencing_file2.write(l.encode())
    chdir(orig_dir)
def run(self):
    for key in ['reads_1', 'reads_2']:
        reads_in = self.reads.output()[key].local_path()
        target = self.output()[key]
        reads_out = target.local_path()
        with gopen(reads_in) as ifile, gopen(reads_out, 'w') as ofile:
            for i, line in enumerate(ifile):
                if (i % 4) == 1:
                    line = line[:CLIP_LEN] + b'\n'
                ofile.write(line)
        target.set_payload(reads_out)
        target.upload()
def get_edge_list():
    cn = relaxed_caveman_graph(GC.cave_num_cliques, GC.cave_clique_size, GC.cave_prob, seed=GC.random_number_seed)
    if GC.random_number_seed is not None:
        GC.random_number_seed += 1
    out = GC.nx2favites(cn, 'u')
    f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb', 9)
    f.write('\n'.join(out).encode()); f.write(b'\n')
    f.close()
    GC.cn_communities = [{c*GC.cave_clique_size + i for i in range(GC.cave_clique_size)} for c in range(GC.cave_num_cliques)]
    f = gopen(expanduser("%s/contact_network_partitions.txt.gz" % GC.out_dir), 'wb', 9)
    f.write(str(GC.cn_communities).encode()); f.write(b'\n')
    f.close()
    GC.cn_communities = [{str(i) for i in c} for c in GC.cn_communities]
    return out
def get_edge_list():
    du = GC.d_or_u == 'd'
    cn = random_partition_graph(GC.rpg_sizes, GC.rpg_p_in, GC.rpg_p_out, directed=du, seed=GC.random_number_seed)
    if GC.random_number_seed is not None:
        GC.random_number_seed += 1
    out = GC.nx2favites(cn, GC.d_or_u)
    f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb', 9)
    f.write('\n'.join(out).encode()); f.write(b'\n')
    f.close()
    f = gopen(expanduser("%s/contact_network_partitions.txt.gz" % GC.out_dir), 'wb', 9)
    f.write(str(cn.graph['partition']).encode()); f.write(b'\n')
    f.close()
    GC.cn_communities = [{str(n) for n in c} for c in cn.graph['partition']]
    return out
def introduce_sequencing_error(node):
    if not hasattr(GC, "sequencing_file"):
        GC.sequencing_file = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone_read1.fastq.gz' % GC.out_dir, 'wb', 9)
        GC.sequencing_file2 = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone_read2.fastq.gz' % GC.out_dir, 'wb', 9)
    orig_dir = getcwd()
    chdir(GC.out_dir)
    makedirs("DWGSIM_output", exist_ok=True)
    chdir("DWGSIM_output")
    cn_label = node.get_name()
    for t in GC.final_sequences[cn_label]:
        f = NamedTemporaryFile(mode='w')
        for l, s in GC.final_sequences[cn_label][t]:
            f.write(">%s\n%s\n" % (l, s))
        f.flush()
        command = [GC.dwgsim_path] + GC.dwgsim_options
        if GC.random_number_seed is not None:
            command += ['-z', str(GC.random_number_seed)]
            GC.random_number_seed += 1
        command.append(f.name)
        command.append('%s_%f' % (cn_label, t))
        try:
            call(command, stderr=open('%s_%f.log' % (cn_label, t), 'w'))
        except FileNotFoundError:
            chdir(GC.START_DIR)
            assert False, "dwgsim executable was not found: %s" % GC.dwgsim_path
        f.close()
        if isfile('%s_%f.bwa.read1.fastq' % (cn_label, t)):
            f = open('%s_%f.bwa.read1.fastq' % (cn_label, t))
        elif isfile('%s_%f.bwa.read1.fastq.gz' % (cn_label, t)):
            f = gopen('%s_%f.bwa.read1.fastq.gz' % (cn_label, t))
        else:
            raise FileNotFoundError("Couldn't find %s_%f.bwa.read1.fastq or %s_%f.bwa.read1.fastq.gz" % (cn_label, t, cn_label, t))
        for l in f:
            if isinstance(l, bytes):
                GC.sequencing_file.write(l)
            else:
                GC.sequencing_file.write(l.encode())
        if isfile('%s_%f.bwa.read2.fastq' % (cn_label, t)):
            f = open('%s_%f.bwa.read2.fastq' % (cn_label, t))
        elif isfile('%s_%f.bwa.read2.fastq.gz' % (cn_label, t)):
            f = gopen('%s_%f.bwa.read2.fastq.gz' % (cn_label, t))
        else:
            raise FileNotFoundError("Couldn't find %s_%f.bwa.read2.fastq or %s_%f.bwa.read2.fastq.gz" % (cn_label, t, cn_label, t))
        for l in f:
            if isinstance(l, bytes):
                GC.sequencing_file2.write(l)
            else:
                GC.sequencing_file2.write(l.encode())
    chdir(orig_dir)
def extract_from_path(fpath):
    try:
        fp = gopen(fpath, 'rb')
        outputf = "%s,%s\t%s\t%s\n"
        _bg, _ed, _src, _eids = "", "", "", []
        for (i, l) in enumerate(fp):
            _row = extract_kv_pair(l)
            if not _bg:
                _bg = _row[0]
            _ed = _row[0]
            if _row[2]:
                # preparing to print out and reset
                _src = _row[2]
                _eids.append(_row[1])
                sys.stdout.write(outputf % (_bg, _ed, ','.join(_eids), _src))
                _bg, _ed, _src, _eids = "", "", "", []
            else:
                # append uid to the list
                _eids.append(_row[1])
        if len(_eids) > 0:
            sys.stdout.write(outputf % (_bg, _ed, ','.join(_eids), 'NA'))
        fp.close()
    except Exception:
        # skip unreadable files so a single bad input does not stop the run
        pass
def get_edge_list():
    if GC.contact_network_file.lower().endswith('.gz'):
        from gzip import open as gopen
        # decode before filtering so '#' comment lines are actually skipped
        lines = [l for l in (i.decode().strip() for i in gopen(GC.contact_network_file)) if len(l) > 0 and l[0] != '#']
    else:
        lines = [i.strip() for i in open(GC.contact_network_file) if len(i.strip()) > 0 and i.strip()[0] != '#']
    for line in lines:
        parts = [e.strip() for e in line.split()]
        assert parts[0] in {'NODE', 'EDGE'}, "Invalid contact network format. First column must be NODE or EDGE"
        if parts[0] == 'NODE':
            assert len(parts) == 3, "Invalid contact network format. NODE rows must have 3 columns"
        else:
            assert len(parts) == 5, "Invalid contact network format. EDGE rows must have 5 columns"
            assert parts[-1] in {'d', 'u'}, 'Invalid contact network format. The last column of EDGE rows must be either "d" or "u"'
    return lines
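# A small example of the contact-network format the parser above accepts
# (whitespace-delimited; '#' lines are comments). Column meanings beyond the
# checked column counts follow the usual FAVITES convention and are assumed here:
#
#   NODE  A  .
#   NODE  B  .
#   EDGE  A  B  .  u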
def FastqReader(fastq_file):
    """ Simple fastq reader returning a generator over a fastq file """
    try:
        # Open the file depending of the compression status
        fastq = gopen(fastq_file, "rb") if fastq_file[-2:] == "gz" else open(fastq_file, "rb")
        i = 0
        # Iterate on the file until the end
        while True:
            # Extract informations from the fastq file
            name, seq, sep, qual = next(fastq), next(fastq), next(fastq), next(fastq)
            # Try to generate a valid FastqSeq object
            try:
                yield FastqSeq(name=name.rstrip()[1:].split()[0], seq=seq.rstrip(), qual=qual.rstrip())
                i += 1
            except AssertionError as E:
                print(E)
                print("Skipping the sequence")
    except IOError as E:
        print(E)
        print("Error while reading {} file".format(fastq_file))
        exit()
    except StopIteration:
        raise StopIteration("\t{} sequences parsed".format(i))
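# Minimal usage sketch for the generator above; FastqSeq is assumed to come from
# the same module and "sample.fastq.gz" is a hypothetical input file:
#
#   n = 0
#   for read in FastqReader("sample.fastq.gz"):
#       n += 1
#   print("{} reads parsed".format(n))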
def introduce_sequencing_error(node):
    orig_dir = getcwd()
    chdir(GC.out_dir)
    makedirs("ART_output", exist_ok=True)
    chdir("ART_output")
    cn_label = node.get_name()
    for t in GC.final_sequences[cn_label]:
        f = NamedTemporaryFile(mode='w')
        for l, s in GC.final_sequences[cn_label][t]:
            f.write(">%s\n%s\n" % (l, s))
        f.flush()
        command = [GC.art_SOLiD_path] + GC.art_SOLiD_options
        if GC.random_number_seed is not None:
            command += ['-r', str(GC.random_number_seed)]
            GC.random_number_seed += 1
        command.append(f.name)
        command.append('%s_%f' % (cn_label, t))
        command.append(str(GC.art_SOLiD_len_read))
        command.append(str(GC.art_SOLiD_fold_coverage))
        try:
            call(command, stdout=open('%s_%f.log' % (cn_label, t), 'w'))
        except FileNotFoundError:
            chdir(GC.START_DIR)
            assert False, "art_SOLiD executable was not found: %s" % GC.art_SOLiD_path
        f.close()
        if not hasattr(GC, "sequencing_file"):
            GC.sequencing_file = gopen('%s/error_prone_files/sequence_data_subsampled_errorprone.fastq.gz' % GC.out_dir, 'wb', 9)
        for l in open('%s_%f.fq' % (cn_label, t)):
            GC.sequencing_file.write(l.encode())
    chdir(orig_dir)
def init():
    assert "ContactNetworkGenerator_File" in str(MF.modules['ContactNetworkGenerator']), "Must use ContactNetworkGenerator_File module"
    assert "EndCriteria_TransmissionFile" in str(MF.modules['EndCriteria']), "Must use EndCriteria_TransmissionFile module"
    assert "TransmissionNodeSample_TransmissionFile" in str(MF.modules['TransmissionNodeSample']), "Must use TransmissionNodeSample_TransmissionFile module"
    assert "TransmissionTimeSample_TransmissionFile" in str(MF.modules['TransmissionTimeSample']), "Must use TransmissionTimeSample_TransmissionFile module"
    if GC.transmission_network_file.lower().endswith('.gz'):
        from gzip import open as gopen
        # decode before filtering so '#' comment lines are actually skipped
        GC.transmission_file = [l.split() for l in (i.decode().strip() for i in gopen(expanduser(GC.transmission_network_file))) if len(l) > 0 and l[0] != '#']
    else:
        GC.transmission_file = [i.strip().split() for i in open(expanduser(GC.transmission_network_file)) if len(i.strip()) > 0 and i[0] != '#']
    for i in range(len(GC.transmission_file)):
        GC.transmission_file[i][2] = float(GC.transmission_file[i][2])
    GC.transmission_num = 0
def _run(self):
    count = 0
    with gopen(self.pe1) as i:
        for line in i:
            count += 1
    with open(self.output()['read_counts'].path, 'a') as o:
        print(f'{self.sample_name},raw_reads,{count / 4}', file=o)
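# Standalone sketch of the same read-counting logic without the task/target
# wrappers; count_fastq_reads and "reads.fastq.gz" are hypothetical names:
from gzip import open as gopen

def count_fastq_reads(path):
    """Count records in a gzipped FASTQ file (4 lines per read)."""
    with gopen(path) as handle:
        n_lines = sum(1 for _ in handle)
    return n_lines // 4

# print(count_fastq_reads("reads.fastq.gz"))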
def stream_file(fn):
    if fn.lower().endswith('.gz'):
        for l in gopen(fn):
            yield l.decode()
    else:
        for l in open(fn):
            yield l
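# Usage sketch: stream_file() yields decoded lines whether or not the input is
# gzip-compressed, so callers stay format-agnostic ("input.txt.gz" and process()
# are placeholders):
#
#   for line in stream_file("input.txt.gz"):
#       process(line.strip())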
def FastqReader(fastq_file):
    """ Simple fastq reader returning a generator over a fastq file """
    try:
        # Open the file depending of the compression status
        fastq = gopen(fastq_file, "rb") if fastq_file[-2:] == "gz" else open(fastq_file, "rb")
        i = 0
        # Iterate on the file until the end
        while True:
            # Extract informations from the fastq file
            name, seq, sep, qual = next(fastq), next(fastq), next(fastq), next(fastq)
            # Try to generate a valid FastqSeq object
            try:
                yield FastqSeq(name=name.rstrip()[1:].split()[0], seq=seq.rstrip(), qual=qual.rstrip())
                i += 1
            except AssertionError as E:
                print(E)
                print("Skipping the sequence")
    except IOError as E:
        print(E)
        print("Error while reading {} file".format(fastq_file))
        exit()
    except StopIteration:
        raise StopIteration("\t{} sequences parsed".format(i))
def read_lines(filename):
    if filename == 'stdin':
        return [l.strip() for l in stdin.read().strip().splitlines()]
    elif filename.lower().endswith('.gz'):
        return [l.strip() for l in gopen(filename).read().decode().strip().splitlines()]
    else:
        return [l.strip() for l in open(filename).read().strip().splitlines()]
def opengzip(transmissionHist: str) -> list:
    """
    Helper method - Opens a gzip and returns the lines of the file.

    Parameters
    ----------
    transmissionHist - the gzip to open; the file object with data on transmissions.
    """
    if isinstance(transmissionHist, str):
        if transmissionHist.lower().endswith('.gz'):
            lines = [l.strip() for l in gopen(transmissionHist, 'rb').read().decode().strip().splitlines()]
        else:
            lines = [l.strip() for l in open(transmissionHist).read().strip().splitlines()]
    else:
        lines = [l.strip() for l in transmissionHist.read().strip().splitlines()]
    return lines
def _gzip_file(path):
    # read the source as bytes so the lines can be written to the binary gzip stream
    with open(path, 'rb') as in_file:
        gz_path = '%s.gz' % path
        with gopen(gz_path, "wb") as out_file:
            out_file.writelines(in_file)
    return gz_path
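# Round-trip sketch for _gzip_file() under the byte-reading assumption above;
# the paths are temporary and purely illustrative:
from gzip import open as gopen
from os import path
from tempfile import mkdtemp

tmp = mkdtemp()
src = path.join(tmp, "example.txt")
with open(src, "w") as fh:
    fh.write("hello\nworld\n")
gz_path = _gzip_file(src)  # -> ".../example.txt.gz"
with gopen(gz_path, "rb") as fh:
    assert fh.read() == b"hello\nworld\n"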
def read_file(fn):
    if fn.lower().endswith('.gz'):
        return [l.strip() for l in gopen(fn).read().decode().strip().splitlines()]
    else:
        return [l.strip() for l in open(fn).read().strip().splitlines()]
def load(fname):
    with gopen(fname) as fin:
        try:
            return json.load(fin)
        except Exception as e:
            print("Error reading %s: %s" % (fname, str(e)))
            return None
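# A matching writer is assumed to live elsewhere; a minimal gzipped-JSON round trip
# against load() above ("data.json.gz" is a hypothetical path, Python 3.6+ assumed):
import json
from gzip import open as gopen

with gopen("data.json.gz", "wt") as fout:
    json.dump({"a": 1, "b": [2, 3]}, fout)
print(load("data.json.gz"))  # -> {'a': 1, 'b': [2, 3]}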
def main(dir_path, output_dir, fractions):
    script_dir = os.path.dirname(__file__)
    try:
        os.mkdir(output_dir)  # create directory for graphs
    except OSError:
        assert False, "Creation of new folder failed"
    output_folder = os.path.join(script_dir, output_dir)
    fractions_list = [float(fraction) for fraction in fractions.split(',')]
    # iterate over files in contacts folder
    for f in os.listdir(dir_path):
        path = os.path.join(dir_path, f)
        num_nodes = get_num_nodes(path)
        if f.lower().endswith('.gz'):
            cascade = gopen(path)
        elif f.lower().endswith('.txt'):
            cascade = open(path)
        else:
            continue
        # Iterate over all observed fractions we want to generate pkl files for
        for fraction in fractions_list:
            if fraction < 0.1 or fraction > 0.9:
                continue
            pkl_dump = favites_to_cascade(cascade, num_nodes, fraction)
            # TODO: better prefix name for file
            new_filename = '%s_%0.1f.pkl' % (f, fraction)
            with open(os.path.join(output_folder, new_filename), 'wb') as pkl_file:
                pkl.dump(pkl_dump, pkl_file)  # dump to pkl file format required by tool
def save(obj, nodata=False):
    fname = os.path.join(obj.outdir, obj.name) + '.pkl.gz'
    with gopen(fname, 'wb') as fout:
        pickle.dump(obj, fout)
    if not nodata:
        IO.saveData(obj)
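# Hedged sketch of the matching loader for save() above; the .pkl.gz naming and
# outdir layout are taken from save(), and load_obj is a hypothetical helper name:
import os
import pickle
from gzip import open as gopen

def load_obj(outdir, name):
    fname = os.path.join(outdir, name) + '.pkl.gz'
    with gopen(fname, 'rb') as fin:
        return pickle.load(fin)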
def __call__(self):
    """Launch the extraction of features """
    print("\nExtract feature sequences")
    # Write the fasta file containing the sequences of the selected features
    fasta_out = self.out_name + ".fa.gz"
    print("\n Write fasta output")
    with gopen(fasta_out, "w") as fout:
        for seq_id, gff_sequence in self.gff_parser.gff_dict.items():
            assert seq_id in self.seq_dict, "fasta and gff are incompatible: {} not found in fasta".format(seq_id)
            print(" Extracting features from sequence {}".format(seq_id))
            for feature in gff_sequence.features:
                fout.write(">{}\n{}\n".format(
                    str(feature).replace("\t", self.separator).replace(" ", "_"),
                    self.extract_seq(seq_id, feature.start, feature.end, feature.strand)))
    # Write the gff file containing the selected features if required
    if self.output_gff:
        gff_out = self.out_name + ".gff.gz"
        print("\n Write gff output")
        with gopen(self.out_name + ".gff.gz", "w") as fout:
            fout.write(str(self.gff_parser))
    # Write a report
    report_out = self.out_name + ".report.txt"
    print("\n Generate a summary report")
    with open(report_out, "w") as fout:
        fout.write("Program {}\tDate {}\n".format(self.VERSION, str(datetime.today())))
        fout.write("\n### OPTIONS ###\n")
        fout.write("Original fasta\t{}\n".format(self.fasta))
        fout.write("Original gff\t{}\n".format(self.gff))
        fout.write("Offset\t{}\n".format(self.offset))
        fout.write("Fusion\t{}\n".format(self.fusion))
        fout.write("Output gff\t{}\n".format(self.output_gff))
        fout.write("Restricted features\t{}\n".format("\t".join(self.features)))
        fout.write("Restricted chromosomes\t{}\n".format("\t".join(self.chromosomes)))
        fout.write("\n### COUNTS ###\n")
        fout.write("Sequence(s) in gff file\t{}\n".format(self.gff_parser.all_seq))
        fout.write("Valid sequence(s) in gff file\t{}\n".format(self.gff_parser.valid_seq))
        fout.write("Features(s) in gff file\t{}\n".format(self.gff_parser.all_features))
        fout.write("Valid features(s) in gff file\t{}\n".format(self.gff_parser.valid_features))
        if self.fusion:
            fout.write("Remaining features after fusion\t{}\n".format(self.gff_parser.fused_features))
def get_flankdb(flankpath):
    Path(flankpath).mkdir(parents=True, exist_ok=True)
    print("[-] Preparing flanking virulence gene database")
    patric = fetch_url('ftp://ftp.patricbrc.org/specialty_genes/referenceDBs/PATRIC_VF.faa',
                       None, flankpath + '/patric.faa')
    victors = fetch_url('http://www.phidias.us/victors/downloads/gen_downloads_protein.php',
                        None, flankpath + '/victors.faa')
    vfdb = fetch_url('http://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz',
                     None, flankpath + '/vfdb.faa.gz')
    params = {'query': 'siderophore AND '
                       'taxonomy:"Bacteria [2]" AND '
                       'NOT receptor NOT partial NOT fragment',
              'format': 'fasta'}
    bgcs = fetch_url('http://www.uniprot.org/uniprot/', params, flankpath + '/bgcs.faa')
    filenames = [patric, victors, vfdb, bgcs]
    db = ''
    for fname in filenames:
        if fname.endswith('.gz'):
            with gopen(fname, 'rt') as infile:
                for line in infile:
                    db += line
        else:
            with open(fname, 'rt') as infile:
                for line in infile:
                    db += line
        remove(fname)
    d1 = db.count('>')
    print(f"[-] {d1} total proteins downloaded")
    accessions = set()
    db2 = ''
    for r in parse(StringIO(db), 'fasta'):
        if r.id not in accessions:
            accessions.add(r.id)
            db2 += r.format('fasta')
    d2 = db2.count('>')
    print(f"[-] Removed {d1 - d2} duplicate accessions")
    fasta_lines = db2.split('>')[0:]  # splits each sequence by header

    def remove_complete_duplicates(fasta_lines):
        print(f"[>] Removing redundancy... ", end="", flush=True)
        outputlist, setofuniqsequence = [], set()
        for sequence in fasta_lines:
            if sequence not in setofuniqsequence:
                outputlist.append(sequence)
                setofuniqsequence.add(sequence)
        print(f"{len(outputlist)} proteins remaining")
        return outputlist

    with open(flankpath + '/flankdb', 'w') as flank_file:
        flank_file.write('>'.join(remove_complete_duplicates(fasta_lines)))
    return run_makeblastdb(flankpath + '/flankdb', 'prot', flankpath + '/flankdb')
def ConvertTar2TarGz(temp_tarfile, dest_targzfile):
    # remove a stale destination archive if one already exists
    try:
        os.remove(dest_targzfile)
    except OSError:
        pass
    with open(temp_tarfile, 'rb') as f_in:
        with gopen(dest_targzfile, 'wb') as f_out:
            copyfileobj(f_in, f_out)
def saveCache():
    try:
        f = gopen(w.persistFile, 'wb')
        dump(cache, f, -1)
        dump(g.immortal, f, -1)
        f.close()
    except Exception:
        print('saveCache("%s") failed' % w.persistFile)
def get_edge_list():
    orig_dir = getcwd()
    makedirs(PANGEASIM_OUTPUT_DIR, exist_ok=True)
    chdir(PANGEASIM_OUTPUT_DIR)
    outfile = open('log.txt', 'w')
    try:
        call([GC.PangeaSim_Acute], stderr=STDOUT, stdout=outfile)
        outfile.close()
    except:
        outfile.close()
        raise RuntimeError("PangeaSim crashed. See %s/log.txt for information" % PANGEASIM_OUTPUT_DIR)
    cn_list = []
    GC.transmission_file = []
    infected_by_acute = set()
    for f in glob('*.csv'):
        if f.startswith('phylogenetic_individualdata'):  # individual attributes
            for l in open(f):
                if l.startswith('Id') or len(l.strip()) == 0:  # header line
                    continue
                p = l.strip().split(',')
                cn_list.append('NODE\t%s\t%s' % (p[0], ','.join(p[1:])))
            remove(f)
        elif f.startswith('phylogenetic_transmission'):  # transmission network
            for l in open(f):
                if l.startswith('IdInfector') or len(l.strip()) == 0:  # header line
                    continue
                u, v, t, acute_infector = l.strip().split(',')
                if u == '-1':  # seed infection
                    u = None
                GC.transmission_file.append((u, v, float(t)))
                if acute_infector.strip() == '1':
                    infected_by_acute.add(v)
            remove(f)
        elif f.startswith('Annual'):
            tmp = gopen('../PangeaSim_annual_survey.csv.gz', 'wb', 9)
            tmp.write(open(f).read().encode())
            tmp.close()
            remove(f)
    assert len(cn_list) != 0 and len(GC.transmission_file) != 0, "PangeaSim error. See %s/log.txt for information" % PANGEASIM_OUTPUT_DIR
    for u, v, t in GC.transmission_file:
        if u is not None:
            cn_list.append('EDGE\t%s\t%s\t%s\td' % (u, v, {True: 'AcuteInfector', False: 'NonAcuteInfector'}[v in infected_by_acute]))
    f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb', 9)
    f.write(b'# Attributes: Id,Sex,DoB,DoD,HIV_pos,RiskGp,t_diagnosed,cd4_diagnosis,cd4atfirstART,t_1stARTstart,t_1stVLsupp_start,t_1stVLsupp_stop\n')
    f.write('\n'.join(cn_list).encode())
    f.write(b'\n')
    f.close()
    remove('log.txt')
    chdir(orig_dir)
    rmdir(PANGEASIM_OUTPUT_DIR)
    return cn_list
def write_file(s, fn):
    if fn.lower().endswith('.gz'):
        f = gopen(fn, 'wb', 9)
        f.write(s.encode())
    else:
        f = open(fn, 'w')
        f.write(s)
    f.close()
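# Round-trip sketch pairing write_file() with the read_file() helper shown earlier;
# "notes.txt.gz" is a hypothetical path and both helpers key off the .gz suffix:
write_file("alpha\nbeta\n", "notes.txt.gz")
assert read_file("notes.txt.gz") == ["alpha", "beta"]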
def writer(self, R1_outname, R2_outname):
    """
    Write sequence couples from outqueue in a pair of fastq files. Sequences will
    remain paired (ie at the same index in the 2 files) but they may not be in the
    same order as in the input fastq files. The process will continue until
    n = n_thread STOP pills were found in the outqueue (ie. the queue is empty)
    """
    # Open output fastq streams for writing
    try:
        out_R1 = gopen(R1_outname, "wb") if self.compress_output else open(R1_outname, "wb")
        out_R2 = gopen(R2_outname, "wb") if self.compress_output else open(R2_outname, "wb")
        current_seq = 0
        buffer_R1 = ""
        buffer_R2 = ""
        # Keep running until all thread STOP pills has been passed
        for works in range(self.n_thread):
            # Will exit the loop as soon as a STOP pill will be found
            for read1, read2 in iter(self.outq.get, "STOP"):
                with self.total_pass.get_lock():
                    self.total_pass.value += 1
                buffer_R1 += read1.fastqstr
                buffer_R2 += read2.fastqstr
                if self.total_pass.value % self.buffer_size == 0:
                    out_R1.write(buffer_R1)
                    out_R2.write(buffer_R2)
                    buffer_R1 = ""
                    buffer_R2 = ""
        # Flush whatever remains in the buffers before closing
        out_R1.write(buffer_R1)
        out_R2.write(buffer_R2)
        buffer_R1 = ""
        buffer_R2 = ""
        out_R1.close()
        out_R2.close()
    except IOError as e:
        print "I/O error({}): {}".format(e.errno, e.strerror)
def on_data(self, data):
    with gopen("%s/tweet_%d.txt.gz" % (self.folder, self.next_id), "wt+") as fout:
        fout.write(data)
    print('.', )
    self.next_id += 1
    if self.next_id % 20 == 0:
        print()
    return True
def get_edge_list():
    cn = complete_graph(GC.num_cn_nodes)
    out = GC.nx2favites(cn, 'u')
    f = gopen(expanduser("%s/contact_network.txt.gz" % GC.out_dir), 'wb', 9)
    f.write('\n'.join(out).encode())
    f.write(b'\n')
    f.close()
    return out
def run(self):
    count = 0
    with gopen(self.reads.output()['reads_1'].local_path()) as i:
        for line in i:
            count += 1
    count /= 4
    target = self.output()['read_count']
    target.set_payload(count)
    target.upload()
def extractAlignmentsRX(f_align, f_align_p, f_stats):
    """
    Extracts the alignments with regex. Easier to parse HUN aligned files, which
    will be dropped due to inconsistencies. Mainly used for the small
    OpenSubtitles corpus not the 2011er one.
    """
    print "Extracting alignments"
    alignments = {}
    final = {}
    hun_files = set()
    doc_count = 0
    link_count = 0
    with gopen(f_align) as align_f:
        for line in align_f:
            line = line.strip()
            if line.startswith("<linkGrp"):
                doc_count += 1
                m = search("fromDoc=\"(.+)\"\stoDoc=\"(.+)\"", line)
                if m:
                    key = (m.group(1), m.group(2))
                elif not m:
                    m = search("toDoc=\"(.+)\"\sfromDoc=\"(.+)\"", line)
                    key = (m.group(2), m.group(1))
                alignments.setdefault(key, [])
            elif line.startswith("<link id="):
                link_count += 1
                m = search("xtargets=\"(.+?)\"", line)
                alignments[key].append(m.group(1).split(";"))
            elif line.startswith("<link certainty="):
                hun_files.add(key)
                if key in alignments:
                    del alignments[key]
                continue
    empty = set()
    for k, v in alignments.iteritems():
        if len(v) != 0:
            final.setdefault(k, v)
        else:
            empty.add(k)
    dumpStruct(f_align_p, final)
    createPath(f_stats)
    with open(f_stats, "w") as stats:
        stats.write("DOCS: %d\nHUN: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n"
                    % (doc_count, len(hun_files), len(empty), len(final), link_count))
        for k in hun_files:
            stats.write(k[0] + " || " + k[1] + "\n")
        stats.write("\n")
def openreadfile(filename):
    '''
    Open a file for reading. Use gzip.open if it's a .gz file.
    '''
    from gzip import open as gopen
    if filename.endswith('.gz'):
        f = gopen(filename, 'rb')
    else:
        f = open(filename, 'r')
    return f
def decompress(self, filename):
    basename = filename.split('.')[:-1]
    txt_file = '.'.join(basename)
    logging.info('Decompressing %s to %s', filename, txt_file)
    # the gzip stream yields bytes, so write the output in binary mode
    with open(txt_file, 'wb') as tf:
        with gopen(filename, 'rb') as gf:
            buffer = gf.read(4096)
            while buffer:
                tf.write(buffer)
                buffer = gf.read(4096)
    logging.info('Decompressing %s finished', filename)
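# The chunked copy above can also be written with shutil.copyfileobj, which streams
# in blocks without loading the whole archive; "dump.txt.gz" is a hypothetical path:
from gzip import open as gopen
from shutil import copyfileobj

with gopen("dump.txt.gz", "rb") as gf, open("dump.txt", "wb") as tf:
    copyfileobj(gf, tf, 4096)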
def extractAlignmentsLXML(f_align, f_align_p, f_stats):
    """
    Extracts alignment information from the alignments file with LXML.
    Used for the large OpenSubtitles 2011 corpus for faster processing.
    """
    print "Extracting alignments"

    class Target(object):
        def __init__(self):
            self.d = dict()
            self.n_links = 0
            self.n_docs = 0

        def start(self, tag, attr):
            if tag == "linkGrp":
                self.n_docs += 1
                self.k = (attr["fromDoc"], attr["toDoc"])
                self.group = self.d[self.k] = []
            elif tag == "link":
                self.n_links += 1
                self.group.append(tuple(attr["xtargets"].split(";")))
                if "certainty" in attr:
                    print "Attention HUN: %s" % self.k

        def close(self):
            pass

    with gopen(f_align) as xml:
        targets = Target()
        parser = etree.XMLParser(target=targets)
        etree.parse(xml, parser)
    alignments = targets.d
    # Documents with no alignments (iterate over a copy since entries are deleted)
    empty = set()
    for k, v in alignments.items():
        if not len(v):
            empty.add(k)
            del targets.d[k]
    dumpStruct(f_align_p, alignments)
    createPath(f_stats)
    with open(f_stats, "w") as stats:
        stats.write("DOCS: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n"
                    % (targets.n_docs, len(empty), len(alignments), targets.n_links))
        for k in empty:
            stats.write("!!! Empty files\n%s || %s\n" % (k[0], k[1]))
        stats.write("\n")
def _summary_not_demultiplexed(artifact_type, filepaths):
    """Generates the HTML summary for non Demultiplexed artifacts

    Parameters
    ----------
    artifact_type : str
        The artifact type
    filepaths : [(str, str)]
        A list of string pairs where the first element is the filepath and the
        second is the filepath type

    Returns
    -------
    list
        A list of strings with the html summary
    """
    # loop over each of the fps/fps_type pairs
    artifact_information = []
    for fps_type, fps in sorted(filepaths.items()):
        # Step 2: generate HTML summary
        # md5, from http://stackoverflow.com/a/3431838
        for fp in fps:
            with open(fp, "rb") as f:
                hash_md5 = md5()
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            # getting head of the files
            header = []
            if artifact_type not in FILEPATH_TYPE_TO_NOT_SHOW_HEAD:
                # we need to encapsulate the full for loop because gzip will
                # not raise an error until you try to read
                try:
                    with gopen(fp, 'r') as fin:
                        header = [line for line, _ in zip(fin, xrange(LINES_TO_READ_FOR_HEAD))]
                except IOError:
                    with open(fp, 'r') as fin:
                        header = [line for line, _ in zip(fin, xrange(LINES_TO_READ_FOR_HEAD))]
            filename = basename(fp)
            artifact_information.append("<h3>%s (%s)</h3>" % (filename, fps_type))
            artifact_information.append("<b>MD5:</b>: %s</br>" % hash_md5.hexdigest())
            if header:
                artifact_information.append(
                    "<p style=\"font-family:'Courier New', Courier, monospace;"
                    "font-size:10;\">%s</p><hr/>" % ("<br/>".join(header)))
    return artifact_information
def __init__(self, len_seq=500, n_seq=1, gziped=False):
    self.seq_dict = {}
    self.temp_dir = mkdtemp()
    if gziped:
        self.fasta_path = path.join(self.temp_dir + "/random.fa.gz")
        with gopen(self.fasta_path, "w") as fp:
            for i in range(n_seq):
                seq = rDNA(len_seq)
                self.seq_dict["seq_{}".format(i)] = seq
                fp.write(">seq_{}\n{}\n".format(i, seq))
    else:
        self.fasta_path = path.join(self.temp_dir + "/random.fa")
        with open(self.fasta_path, "w") as fp:
            for i in range(n_seq):
                seq = rDNA(len_seq)
                self.seq_dict["seq_{}".format(i)] = seq
                fp.write(">seq_{}\n{}\n".format(i, seq))
def __call__(self):
    """ Simple fastq reader returning a generator over a fastq file """
    try:
        # Open the file depending of the compression status
        fastq = gopen(self.fastq_file, "rb") if self.fastq_file[-2:] == "gz" else open(self.fastq_file, "rb")
        # Iterate on the file until the end
        while True:
            # Extract informations from the fastq file
            name, seq, sep, qual = next(fastq), next(fastq), next(fastq), next(fastq)
            split_name = name.split(":")
            # Try to generate a valid FastqSeq object
            try:
                yield FastqSeq(
                    sampleName=":".join(split_name[0:-2])[1:],
                    seq=seq.rstrip(),
                    qual=qual.rstrip(),
                    sampleIndex=split_name[-2].rstrip(),
                    molecularIndex=split_name[-1].rstrip())
                self.n_seq += 1
            except AssertionError as E:
                print(E)
                print("Skipping the sequence")
    except IOError as E:
        print(E)
        print("Error while reading {} file".format(self.fastq_file))
        exit()
    except StopIteration:
        # close the handle before signalling the end of the stream
        fastq.close()
        raise StopIteration("\t{} sequences parsed".format(self.n_seq))
def output_reference(self):
    """
    Output a reference corresponding to the original sequenced but masked with a
    masking character for bases overlapped by a BlastHit.
    """
    # Count the number of hit in all Sequence objects from the Reference
    if not self.n_hit:
        return None
    # Write a new compressed reference in the current folder
    elif self.compress:
        with gopen(self.modified_fasta, "wb") as fasta:
            for seq in self.seq_dict.values():
                # Write the sequence in the fasta file
                fasta.write(">{}\n{}\n".format(seq.name, seq.output_sequence()))
        return self.modified_fasta
    # Write a new uncompressed reference in the current folder
    else:
        with open(self.modified_fasta, "w") as fasta:
            for seq in self.seq_dict.values():
                # Write the sequence in the fasta file
                fasta.write(">{}\n{}\n".format(seq.name, seq.output_sequence()))
        return self.modified_fasta
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restrinction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage
       of reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals, linewidth=1, elinewidth=1,
                color='darkblue', yerr=errorquals, ecolor='orange')
    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)]  # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)]  # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)]  # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)] if paired else 0)))
               / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') % (
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
def quality_plot(fnam, nreads=None, axe=None, savefig=None):
    """
    Plot the qualities

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    phred = ('!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
             '[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')
    quals = []
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    if nreads:
        while True:
            try:
                next(fhandler)
            except EOFError:
                break
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred.index(i) for i in line.strip()])
            if len(quals) > nreads:
                break
    else:  # do this because it's faster
        while True:
            try:
                next(fhandler)
            except EOFError:
                break
            # skip the sequence and separator lines (4 lines per FASTQ record)
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred.index(i) for i in line.strip()])
    fhandler.close()
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        plt.clf()
    else:
        fig = plt.figure()
        plt.clf()
        ax = fig.add_subplot(111)
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')
    plt.figure(figsize=(15, 7))
    plt.errorbar(range(len(line.strip())), meanquals, yerr=errorquals, ecolor='orange')
    plt.xlim((0, len(line)))
    plt.xlabel('Sequence')
    plt.ylabel('PHRED score')
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
from classes import UniprotKB, Refseq

with Database.db as cursor:
    cursor.execute("TRUNCATE TABLE " + Config.DB_NAME + ".protein_mapping")
    Database.db.commit()

    uniprotKB = UniprotKB.UniprotKB(Database.db, cursor)
    refseq = Refseq.Refseq(Database.db, cursor)

    refseqHash = refseq.buildFullRefseqMappingHash()
    uniprotHash = uniprotKB.buildAccessionHash()

    mapping = set()
    with gopen(Config.EG_REFSEQ2UNIPROT) as file:
        insertCount = 0
        for line in file.readlines():
            line = line.strip()

            # Skip Blank Lines and header lines
            if len(line) <= 0 or "#" == line[0]:
                continue

            splitLine = line.split("\t")
            refseqAcc = splitLine[0].strip()
            uniprotAcc = splitLine[1].strip()

            if refseqAcc in refseqHash and uniprotAcc in uniprotHash:
                refseqID = refseqHash[refseqAcc]
def process_single_end(self):
    count = OrderedDict()

    ##### CUTADAPT #####
    if self.skip_cutadapt:
        print("\nSkiping cutadapt step")
        trimmed_fastq = self.fastq_R1
    else:
        print("\nStarting trimming with CUTADAPT")
        trimmed_fastq = self.basename + "_trim.fastq.gz"
        cutadapt_report = self.basename + "_trim_report.txt"
        cmd = "cutadapt {} {} {} -o {}".format(self.cutadapt_opt,
            "-a file:" + self.adapter if self.adapter else "", self.fastq_R1, trimmed_fastq)
        print(cmd)
        if self.run:
            with open(cutadapt_report, "w") as fout:
                for line in self.yield_cmd(cmd):
                    fout.write(line)
            # Extract values from cutadapt_report
            with open(cutadapt_report, "r") as fin:
                for line in fin:
                    if line.startswith("Total reads processed:"):
                        count["Total reads before trimming"] = int(line.split()[-1].replace(",", ""))
                    if line.startswith("Reads with adapters:"):
                        count["Reads with adapters"] = int(line.split()[-2].replace(",", ""))
                    if line.startswith("Reads that were too short:"):
                        count["Reads that were too short"] = int(line.split()[-2].replace(",", ""))
                    if line.startswith("Reads written (passing filters):"):
                        count["Reads after trimming"] = int(line.split()[-2].replace(",", ""))

    ##### BWA #####
    print("\nStart aligning with BWA MEM and sort reads")
    mapped_bam = self.basename + "_mapped.bam"
    unmapped_fastq = self.basename + "_clean.fastq.gz"
    # Prepare the command line
    cmd = "bwa mem {0} -t {1} {2} {3}".format(self.bwa_opt, self.thread, self.index, trimmed_fastq)
    print(cmd)
    if self.run:
        # counters
        total = mapped = unmapped = 0
        # Initialize the stream line per line generator
        sam = self.yield_cmd(cmd)
        # Initialize and parse the bam header
        h = BAMHeader()
        for line in sam:
            # Add the line to BAMheader object until the first non header line is found
            if line.startswith("@"):
                h.add_header_line(line)
            else:
                break
        # Initialize a bam read parser
        bam_parser = BAMSequenceParser(header=h, skip_secondary=False)
        # Create an output bam file for mapped reads and an ouput fastq file for unmapped reads
        with pysam.AlignmentFile(mapped_bam, "wb", header=h.header) as bam_out, \
             gopen(unmapped_fastq, "w") as fastq_out:
            # Process the first sequence found when parsing header
            read = bam_parser.parse_line(line)
            total += 1
            if read:
                if read.is_properly_mapped(self.min_mapq, self.min_match_size):
                    bam_out.write(read.to_bam())
                    mapped += 1
                else:
                    fastq_out.write(read.to_fastq())
                    unmapped += 1
            # Process the remaining sequences
            for line in sam:
                read = bam_parser.parse_line(line)
                total += 1
                if total % 1000000 == 0:
                    print("{} sequence processed".format(total))
                if read:
                    if read.is_properly_mapped(self.min_mapq, self.min_match_size):
                        bam_out.write(read.to_bam())
                        mapped += 1
                    else:
                        fastq_out.write(read.to_fastq())
                        unmapped += 1
        # Retrieve Count values from the BAMSequenceParser object
        count["Total reads processed by BWA"] = total
        count["Reads Mapped"] = mapped
        count["Reads Unmapped"] = unmapped
        count["Primary read"] = bam_parser.count["primary"]
        count["Secondary read"] = bam_parser.count["secondary"]
        count["Invalid reads"] = bam_parser.count["invalid"]
    return count
cursor.execute( "SELECT uniprot_id FROM " + Config.DB_NAME + ".uniprot WHERE organism_id=%s", [organismID] ) for row in cursor.fetchall( ) : uniprotID = row[0] # Inactivate proteins in these two tables only for the specific organism cursor.execute( "UPDATE " + Config.DB_NAME + ".uniprot SET uniprot_status='inactive' WHERE uniprot_id = %s", [uniprotID] ) cursor.execute( "UPDATE " + Config.DB_NAME + ".uniprot_features SET uniprot_feature_status='inactive' WHERE uniprot_id = %s", [uniprotID] ) # Delete associated annotation that goes with the deactivated records cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_aliases WHERE uniprot_id=%s", [uniprotID] ) cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_definitions WHERE uniprot_id=%s", [uniprotID] ) cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_externals WHERE uniprot_id=%s", [uniprotID] ) cursor.execute( "DELETE FROM " + Config.DB_NAME + ".uniprot_go WHERE uniprot_id=%s", [uniprotID] ) Database.db.commit( ) filename = Config.UP_PROTEINS_DIR + "uniprot_proteins_" + str(organismID) + ".xml.gz" print "Working on : " + str(organismID) + " (" + filename + ")" with gopen( filename ) as uniprotFile : parse( uniprotFile, UniprotProteinXMLParser.UniprotProteinXMLParser( uniprotKB, organismID ) ) Database.db.commit( ) cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'UNIPROT_updateProteins', NOW( ) )" ) Database.db.commit( ) sys.exit( )
def myopen(fname, mode='r'):
    if fname[-2:] == 'gz':
        from gzip import open as gopen
        return gopen(fname, mode)
    else:
        return open(fname, mode)
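# Usage sketch for myopen(); "variants.vcf.gz" is a hypothetical path. Note that
# gzip.open defaults to bytes, so an explicit text mode such as 'rt' is usually
# what callers want when iterating lines:
with myopen("variants.vcf.gz", "rt") as handle:
    for line in handle:
        if not line.startswith("#"):
            break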
# Parses UNIPROT_SWISSPROT file and pull out only accessions
# into staging area table
import sys, string
import MySQLdb
import Database
import Config

from classes import UniprotAccessionXMLParser
from xml.sax import parse
from gzip import open as gopen

with Database.db as cursor:
    cursor.execute("TRUNCATE TABLE " + Config.DB_STAGING + ".swissprot_ids")
    Database.db.commit()

    with gopen(Config.UP_SWISSPROT) as swissprot:
        parse(swissprot, UniprotAccessionXMLParser.UniprotAccessionXMLParser())

sys.exit()
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """
    Copies the documents with alignment in a clean format to a new folder as text files.
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0
    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)
            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)
                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0
                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()
                            if line.startswith("<s"):
                                last_id = match('.*id="([0-9]+)"', line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("'", "")
                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)
                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n'
                        % (last_id, words, lang.replace(".gz", "")))
                    for k, v in enumerate(doc):
                        sid = k + 1
                        if len(v) > 1:
                            xml_f.write('<s id="%s">%s</s>\n' % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))
                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0
        for v in align_p_f.itervalues():
            scount += len(v)
        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))
    dumpStruct(f_align_p, align_p_f)
from gzip import open as gopen
from classes import UniprotKB

descRE = re.compile('^([A-Z0-9]+_{1}[A-Z0-9]+) (.*?) (OS=.*?)? (GN=(.*?))?$', re.VERBOSE)

with Database.db as cursor:
    cursor.execute("UPDATE " + Config.DB_NAME + ".uniprot_isoforms SET uniprot_isoform_status='inactive'")
    Database.db.commit()

    uniprotKB = UniprotKB.UniprotKB(Database.db, cursor)
    accessionHash = uniprotKB.buildAccessionHash()
    organismHash = uniprotKB.buildOrganismHash()

    with gopen(Config.UP_ISOFORMS) as file:
        currentInfo = {}
        for line in file.readlines():
            line = line.strip()

            # Skip Blank Lines
            if len(line) <= 0:
                continue

            if ">" == line[0]:
                if len(currentInfo) > 0:
                    if currentInfo["ACCESSION"] in accessionHash:
                        uniprotID = accessionHash[currentInfo["ACCESSION"]]
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restrinction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage
       of reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    else:
        r_enzs = None  # else let it as None
    if r_enzs:
        # normalize enzyme names to the casing used in RESTRICTION_ENZYMES
        for k in RESTRICTION_ENZYMES.keys():
            for i in range(len(r_enzs)):
                if k.lower() == r_enzs[i].lower():
                    r_enzs[i] = k
    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enzs:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0   # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in l_sites.values():
                seq = seq.replace(lig.upper(), lig)
            for r_enz in r_enzs:
                sites[r_enz].extend([m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend([m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each paired of cut-site
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if r_enz:  # do both plots
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:  # only do the quality_plot plot
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')
    ax.errorbar(range(max_seq_len), meanquals, linewidth=1, elinewidth=1,
                color='darkblue', yerr=errorquals, ecolor='orange')
    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in xrange(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if r_enzs:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(r_enzs), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len
        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k) for k in xrange(seq_len)]  # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k) for k in xrange(seq_len)]  # DE
        for r1, r2 in liges:
            liges[(r1, r2)] = [liges[(r1, r2)].count(k) for k in xrange(seq_len)]  # OK
        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])
                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k-pos]
                                 for k in xrange(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] +
                                 [fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                                  for k in xrange(pos, seq_len)])
        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len / 2 - site_len: max_seq_len / 2] = [float('nan')] * site_len
                fixes[k][max_seq_len / 2 - site_len: max_seq_len / 2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len / 2 - site_len: max_seq_len / 2] = [float('nan')] * site_len
        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            ax2.plot(sites[r_enz], linewidth=2, color=color.next(), alpha=0.9,
                     label='Undigested RE site (%s: %s)' % (r_enz, r_sites[r_enz])
                     if any([f > 0 for f in fixes[r_enz]])
                     else 'Undigested & Dangling-Ends (%s: %s)' % (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        lines, labels = ax2.get_legend_handles_labels()
        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            ax3.plot(liges[(r1, r2)], linewidth=2, color=color.next(), alpha=0.9,
                     label='Ligated (%s-%s: %s)' % (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')
        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)
        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                ax4.plot(fixes[r_enz], linewidth=2, color=color.next(), alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))
        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] - liges[k][max_seq_len / 2])
        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len / 2])
        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = ((100. * (fixes[r_enz][0] +
                                       (fixes[r_enz][(max_seq_len / 2)] if paired else 0)))
                              / nreads)
            else:
                des[r_enz] = ((100. * (sites[r_enz][0] +
                                       (sites[r_enz][(max_seq_len / 2)] if paired else 0)))
                              / nreads)
        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                              for r_enz1 in r_enzs for r_enz2 in r_enzs
                              if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += ('Percentage of digested sites (not considering Dangling-Ends) '
                      '%s: %.1f%%\n' % (r_enz, 100. * float(lcnt) / (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz, des[r_enz])
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += ('Percentage of reads with ligation site (%s-%s): %.1f%% \n'
                          % (r_enz1, r_enz2, (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0), loc=3,
                   borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    return des, ligep