def test_guess_format():
    'It tests that the format of the sequence files can be guessed'
    # every case is: file content, file name (None leaves the handle
    # without a name attribute) and the format that should be guessed
    cases = (
        ('>fasta\nACTAG\n', None, 'fasta'),
        ('LOCUS AX0809\n', None, 'genbank'),
        ('@fastq\nACTAG\n', 'hola.sfastq', 'fastq'),
        ('@fastq\nACTAG\n+\nAt+AA', 'hola.fastq', 'fastq'),
    )
    for content, fname, expected_format in cases:
        fhand = StringIO.StringIO(content)
        if fname is not None:
            fhand.name = fname
        assert guess_seq_file_format(fhand) == expected_format
def scrape_info_from_fname(path):
    '''It builds the file info dict using the file format and the file name.

    path can be a string with the file path or an object with a
    last_version attribute that holds the path to read.

    The base name (without its last extension) is split by dots and every
    item whose third character is an underscore (like pl_454) is stored in
    the result as a key/value pair. The value of the pl tag is lowercased.

    It returns a dict with the scraped tags plus:
        format -- the format returned by guess_seq_file_format
        fpath  -- the given path, untouched

    It raises a KeyError if the file name has no pl tag and a RuntimeError
    if the value of the pl tag is not in ACCEPTED_PLATFORMS.
    '''
    if isinstance(path, basestring):
        fpath = path
    else:
        fpath = path.last_version

    file_info = {}
    # the with block closes the file even if the format guessing fails
    with open(fpath) as fhand:
        file_info['format'] = guess_seq_file_format(fhand)

    basename = os.path.splitext(os.path.basename(fpath))[0]
    for item in basename.split('.'):
        # only the items that look like xx_value are tags
        if len(item) < 3 or item[2] != '_':
            continue
        key, value = item.split('_', 1)
        if key == 'pl':
            value = value.lower()
        file_info[key] = value
    file_info['fpath'] = path

    platform = file_info['pl']
    if platform not in ACCEPTED_PLATFORMS:
        # !s is used because ACCEPTED_PLATFORMS does not need to be a string
        # and the s format code is only meant for strings
        msg = "The platform of your file({0!s}) is not ".format(platform)
        msg += 'in the accepted ones {0!s}'.format(ACCEPTED_PLATFORMS)
        raise RuntimeError(msg)
    return file_info
def create_cdna_intron_annotator(genomic_db, genomic_seqs_fhand):
    '''It creates a function that annotates the introns of a cdna.

    The introns are inferred by infer_introns_for_cdna using the genomic
    blast database and an index of the genomic sequence file.
    '''
    genomic_seqs_fhand = get_fhand(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_fhand.name,
                                     guess_seq_file_format(genomic_seqs_fhand))

    def annotate_intron(sequence):
        '''It adds an intron feature to the sequence for every intron found.

        It returns the same sequence, or None when no sequence is given.
        A KeyError raised while inferring the introns is raised again as a
        RuntimeError.
        '''
        if sequence is None:
            return
        try:
            intron_positions = infer_introns_for_cdna(
                                        sequence=sequence,
                                        genomic_db=genomic_db,
                                        genomic_seqs_index=genomic_seqs_index)
        except KeyError as key_error:
            # the text of a KeyError is the repr of the key, so the u prefix
            # and the quotes are removed from it
            msg = str(key_error).lstrip('u').strip("'")
            if 'not found' in msg:
                msg += ' in seq file %s, but present in blast db %s' % \
                                        (genomic_seqs_fhand.name, genomic_db)
            raise RuntimeError(msg)

        qualifiers = None
        for position in intron_positions:
            # every feature gets its own qualifiers dict
            qualifiers = {'genomic_db':genomic_db}
            location = FeatureLocation(position, position)
            sequence.features.append(SeqFeature(location=location,
                                                type='intron',
                                                qualifiers=qualifiers))
        return sequence

    return annotate_intron
def make_backbone_blast_db(project_dir, blast_db_seq, dbtype):
    '''It formats a blast database when it is needed.

    project_dir  -- the project directory, the database is created in its
                    BACKBONE_DIRECTORIES['blast_databases'] directory
    blast_db_seq -- path to the sequence file used to build the database
    dbtype       -- the database type passed to makeblastdb_plus

    If the database sequence file is already in the database directory
    nothing is done. Otherwise a fasta file is symlinked as it is and any
    other format is converted to fasta before the formatting.

    It returns the path to the sequence file of the database.
    It raises a RuntimeError if makeblastdb_plus fails, in that case the
    sequence file of the database is removed.
    '''
    logger = logging.getLogger(LOGGER_NAME)

    db_dir = join(project_dir, BACKBONE_DIRECTORIES['blast_databases'])
    if not exists(db_dir):
        makedirs(db_dir)

    #the name should be the basename of the blast_db_seq
    db_seq_fpath = join(db_dir, _get_basename(blast_db_seq))
    if exists(db_seq_fpath):
        return db_seq_fpath

    #every file is opened in a with block so it gets closed even if there
    #is an error
    with open(blast_db_seq) as seq_fhand:
        blast_db_seq_format = guess_seq_file_format(seq_fhand)

    if blast_db_seq_format == 'fasta':
        rel_symlink(blast_db_seq, db_seq_fpath)
    else:
        with open(blast_db_seq) as in_seq_fhand:
            with open(db_seq_fpath, 'w') as out_seq_fhand:
                seqio(in_seq_fhand=in_seq_fhand,
                      out_seq_fhand=out_seq_fhand,
                      out_format='fasta')

    logger.info('Formatting the database %s' % db_seq_fpath)
    try:
        makeblastdb_plus(db_seq_fpath, dbtype=dbtype)
    except RuntimeError:
        msg = 'Error making blastdb. db:%s\n dbtype:%s\n' % \
                                               (db_seq_fpath, dbtype)
        #the sequence file is removed, otherwise the next call would take
        #the database as already done
        remove(db_seq_fpath)
        raise RuntimeError(msg)
    return db_seq_fpath
def seq_pipeline_runner(pipeline, configuration, in_fhands, file_format=None,
                        writers=None, processes=False):
    '''It runs all the analysis for the given sequence pipeline.

    pipeline      -- the pipeline, or the name of one of the PIPELINES
    configuration -- the configuration handed to the pipeline processing
    in_fhands     -- a dict with the input files. in_seq is required and
                     in_qual is optional
    file_format   -- the format of the input. If it is not given it is
                     guessed from the in_seq file
    writers       -- a dict with the writers. Every processed sequence is
                     written with all of them. No writers means that the
                     sequences are processed, but not written
    processes     -- if it is false or 1 the sequences go through
                     _process_sequences, otherwise they go through
                     _parallel_process_sequences

    It returns a dict with the num_features of every writer, using the keys
    of the writers dict.
    '''
    if isinstance(pipeline, str):
        pipeline = PIPELINES[pipeline]

    if file_format is None:
        file_format = guess_seq_file_format(in_fhands['in_seq'])

    # without this a call that uses the default writers would fail
    if writers is None:
        writers = {}

    # Here we extract our input files
    in_fhand_seqs = in_fhands['in_seq']
    in_fhand_qual = in_fhands.get('in_qual') if 'in_qual' in in_fhands else None

    # NOTE: True is equal to 1, so processes=True also means no parallel run
    processes = None if processes == 1 else processes

    # the temporary file is only required by the parallel processing
    temp_out_fhand = NamedTemporaryFile() if processes else None
    try:
        # Here the SeqRecord generator is created
        if processes:
            sequences = _parallel_process_sequences(in_fhand_seqs,
                                                    in_fhand_qual,
                                                    file_format, pipeline,
                                                    configuration, processes,
                                                    temp_out_fhand.name)
        else:
            sequences = _process_sequences(in_fhand_seqs, in_fhand_qual,
                                           file_format, pipeline,
                                           configuration)

        # The SeqRecord generator is consumed
        for sequence in sequences:
            for writer in writers.values():
                writer.write(sequence)
    finally:
        # the temporary file is closed even if the processing fails
        if temp_out_fhand is not None:
            temp_out_fhand.close()

    # Some of the writers need to be closed in order to finish their work
    feature_counter = {}
    for wtype, writer in writers.items():
        if 'close' in dir(writer):
            writer.close()
        feature_counter[wtype] = writer.num_features
    return feature_counter
def __init__(self, reference_fhand, reads_fhand, output_fhand,
             keep_unmapped=True):
    '''It stores the given files, writes the header and indexes the reads.

    The reads are indexed by SeqIO.index using the name of reads_fhand and
    the format guessed for it.
    '''
    self._reference_fhand = reference_fhand
    self._output_fhand = output_fhand
    # the header is written as soon as both files are stored
    self._write_header()

    self._keep_unmapped = keep_unmapped

    reads_format = guess_seq_file_format(reads_fhand)
    reads_fpath = reads_fhand.name
    self._read_index = SeqIO.index(reads_fpath, format=reads_format)
def backbone_blast_runner(query_fpath, project_dir, blast_program,
                          blast_db=None, blast_db_seq=None, dbtype='nucl',
                          threads=False):
    '''It runs the blast unless its result is already stored.

    query_fpath   -- path to the query sequences. If the file is not fasta
                     a temporary fasta file is used for the blast
    project_dir   -- the project directory, the results go to its
                     BACKBONE_DIRECTORIES['blast_dir'] directory
    blast_program -- the blast program handed to blast_runner_plus
    blast_db      -- the blast database to use
    blast_db_seq  -- a sequence file, when given the database is built from
                     it with make_backbone_blast_db
    dbtype        -- the type of the database to build
    threads       -- handed to blast_runner_plus

    It returns the path to the blast result, an xml file.
    It raises a RuntimeError if neither blast_db nor blast_db_seq are given
    or if the blast fails. A result left by a failed blast is removed.
    '''
    if blast_db is None and blast_db_seq is None:
        raise RuntimeError('It needs a blast database or seqfile')

    logger = logging.getLogger(LOGGER_NAME)

    #the result directory depends on the query and on the database
    query_basename = _get_basename(query_fpath)
    blast_dir = join(project_dir, BACKBONE_DIRECTORIES['blast_dir'])
    db_basename = _get_basename(blast_db if blast_db else blast_db_seq)
    result_dir = join(blast_dir, query_basename, db_basename)
    if not exists(result_dir):
        makedirs(result_dir)
    result_fpath = join(result_dir,
                        '%s.%s.xml' % (BACKBONE_BASENAMES['blast_basename'],
                                       blast_program))
    if exists(result_fpath):
        logger.info('Using the stored blast result %s' % result_fpath)
        return result_fpath

    #the input file should be fasta
    with open(query_fpath) as query_fhand:
        query_is_fasta = guess_seq_file_format(query_fhand) == 'fasta'

    fasta_query_fhand = None
    try:
        if not query_is_fasta:
            fasta_query_fhand = _create_temp_fasta_file(query_fpath)
            query_fpath = fasta_query_fhand.name

        #we have to create a database in
        #BACKBONE_DIRECTORIES['blast_databases']
        if blast_db_seq:
            blast_db = make_backbone_blast_db(project_dir, blast_db_seq,
                                              dbtype)

        logger.info('Running the blast %s' % result_fpath)
        try:
            blast_runner_plus(query_fpath, blast_db, blast_program,
                              result_fpath, threads=threads)
        except RuntimeError as error:
            #a partial result would be taken as a stored one in the next run
            if exists(result_fpath):
                remove(result_fpath)
            msg = '%s \n database: %s\n database type: %s' % (str(error),
                                                              blast_db,
                                                              dbtype)
            raise RuntimeError(msg)
    finally:
        #the temporary fasta is closed whether the blast worked or not
        if fasta_query_fhand is not None:
            fasta_query_fhand.close()
    return result_fpath
def main():
    'It removes the requested annotations from the seqs and writes them'
    in_fhand, out_fhand, annots_to_remove = set_parameters()

    # the output is written in the format guessed for the input
    seq_format = guess_seq_file_format(in_fhand)

    clean_seqs = remove_annotation(in_fhand, seq_format, annots_to_remove)
    write_seqs_in_file(clean_seqs, seq_fhand=out_fhand, format=seq_format)
def run(self):
    '''It runs the analysis.

    The read files are classified by the platform tag found in their names
    (pl_454, pl_illumina or pl_sanger) and the files of every platform are
    joined by _cat_to_fasta into one fasta and one qual file. The sanger
    files with quality (fastq) go before the ones without it (fasta).

    An output that already exists is not created again, so the check is
    done per output file.
    '''
    self._log({'analysis_started':True})
    files_illumina = []
    files_454 = []
    files_sanger_with_qual = []
    files_sanger_without_qual = []
    # all the opened files are tracked here because the ones that do not
    # fit in any of the lists have to be closed too
    opened_fhands = []
    try:
        for path in self._get_input_fpaths()['reads']:
            fpath = path.last_version
            fhand = open(fpath)
            opened_fhands.append(fhand)
            fname = os.path.basename(fpath).lower()
            # a file belongs to just one platform, if it were in two lists
            # the same handle would be read twice
            if 'pl_454' in fname:
                files_454.append(fhand)
            elif 'pl_illumina' in fname:
                files_illumina.append(fhand)
            elif 'pl_sanger' in fname:
                format_ = guess_seq_file_format(fhand)
                if format_ == 'fasta':
                    files_sanger_without_qual.append(fhand)
                elif format_ == 'fastq':
                    files_sanger_with_qual.append(fhand)

        #fastq are processed before
        files_sanger = files_sanger_with_qual + files_sanger_without_qual

        #all files should be fasta and fasta.qual
        output_dir = self._create_output_dirs()['assembly_input']
        project_name = self._get_project_name()
        for ext, files in (('_in.454', files_454),
                           ('_in.sanger', files_sanger),
                           ('_in.illumina', files_illumina),):
            base_name = os.path.join(output_dir, project_name + ext)
            fasta_fpath = base_name + '.fasta'
            qual_fpath = base_name + '.fasta.qual'
            if os.path.exists(fasta_fpath) or not files:
                continue
            with open(fasta_fpath, 'w') as fasta_fhand:
                with open(qual_fpath, 'w') as qual_fhand:
                    self._cat_to_fasta(files, fasta_fhand, qual_fhand)
    finally:
        # close all files, even if there has been an error
        for fhand in opened_fhands:
            fhand.close()
    self._log({'analysis_finished':True})
def scrape_info_from_fname(path):
    """It builds the file info dict using the file format and the file name.

    path can be a string with the file path or an object with a
    last_version attribute that holds the path to read.

    The base name (without its last extension) is split by dots and every
    item whose third character is an underscore (like pl_454) is stored in
    the result as a key/value pair.

    It returns a dict with the scraped tags plus:
        format -- the format returned by guess_seq_file_format
        fpath  -- the given path, untouched
    """
    if isinstance(path, basestring):
        fpath = path
    else:
        fpath = path.last_version

    file_info = {}
    # the with block closes the file even if the format guessing fails
    with open(fpath) as fhand:
        file_info["format"] = guess_seq_file_format(fhand)

    basename = os.path.splitext(os.path.basename(fpath))[0]
    for item in basename.split("."):
        # only the items that look like xx_value are tags
        if len(item) < 3 or item[2] != "_":
            continue
        key, value = item.split("_", 1)
        file_info[key] = value
    file_info["fpath"] = path
    return file_info
def seqio(in_seq_fhand, out_seq_fhand, out_format, double_encoding=False,
          in_qual_fhand=None, out_qual_fhand=None, in_format=None):
    '''It converts the format of the sequence files.

    If the input format is not given it is guessed from the input file.
    The conversion is done by seqs_in_file and write_seqs_in_file when there
    is any quality file or when any of the formats is repr, json or pickle.
    Any other conversion is done by SeqIO.convert.
    The output files are flushed, but not closed.
    '''
    if not in_format:
        in_format = guess_seq_file_format(in_seq_fhand)

    own_formats = ('repr', 'json', 'pickle')
    uses_qual = in_qual_fhand is not None or out_qual_fhand is not None
    uses_own_format = in_format in own_formats or out_format in own_formats

    if not (uses_qual or uses_own_format):
        SeqIO.convert(in_seq_fhand, in_format, out_seq_fhand, out_format)
    else:
        seq_records = seqs_in_file(seq_fhand=in_seq_fhand,
                                   qual_fhand=in_qual_fhand,
                                   format=in_format,
                                   double_encoding=double_encoding)
        write_seqs_in_file(seq_records, seq_fhand=out_seq_fhand,
                           qual_fhand=out_qual_fhand, format=out_format)

    out_seq_fhand.flush()
    if out_qual_fhand:
        out_qual_fhand.flush()
def create_unique_contiguous_region_filter(distance, genomic_db,
                                           genomic_seqs_fpath):
    '''It returns a filter that marks the snvs found in a region that gives
    more than one match or more than one match_part.

    distance           -- number of residues taken at each side of the snv
                          to build the blasted region
    genomic_db         -- the genomic blast database
    genomic_seqs_fpath -- path to the genomic sequence file

    It raises a ValueError if the genomic sequence file or the genomic
    database are not given.
    '''
    #the arguments are checked before doing any work
    if not genomic_seqs_fpath:
        msg = 'No genomic sequence file defined for unique SNV filter'
        raise ValueError(msg)
    if not genomic_db:
        msg = 'No genomic blast database defined for unique SNV filter'
        raise ValueError(msg)

    parameters = {'database': genomic_db}
    blast_runner = create_runner(tool='blastn', parameters=parameters)
    blast_parser = get_alignment_parser('blast')
    match_filters = [{'kind'     : 'score_threshold',
                      'score_key': 'similarity',
                      'min_score': 90,
                     },
                     {'kind'            : 'min_length',
                      'min_num_residues': 20,
                      'length_in_query' : True
                     }
                    ]

    #the file is only opened to guess its format, the index is built from
    #the path, so the file can be closed right away
    with open(genomic_seqs_fpath) as genomic_seqs_fhand:
        genomic_seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_fpath, genomic_seqs_format)

    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous.

        The result stored for every snv is True when its region has more
        than one match or when introns are inferred for it, and False
        otherwise. The snvs that already have a result for this distance
        are skipped. It returns the given sequence.
        '''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            #we make a blast with the sequence around the snv
            location = snv.location.start.position
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']
            try:
                #now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                #are there any similar sequences?
                try:
                    alignment = next(alignments)
                except StopIteration:
                    #if there is no similar sequence we assume that is
                    #unique
                    result = False
                else:
                    #how many matches, it should be only one
                    if len(alignment['matches']) > 1:
                        result = True
                    else:
                        #how many match parts have the first match?
                        #we could do it with the blast result, but blast is
                        #not very good aligning, so we realign (with
                        #est2genome, according to the original note)
                        blast_fhand.seek(0)
                        sim_seqs = similar_sequences_for_blast(blast_fhand)
                        sim_seq = sim_seqs[0] if sim_seqs else None
                        introns = infer_introns_for_cdna(
                                        sequence=seq_fragment,
                                        genomic_seqs_index=genomic_seqs_index,
                                        similar_sequence=sim_seq,
                                        genomic_db=genomic_db)
                        result = bool(introns)
            finally:
                #the blast result is closed even if there is an error
                blast_fhand.close()
            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence

    return unique_contiguous_region_filter
def test_staticmethod():
    'The format guessed for an empty file should be None, it should not fail'
    empty_fhand = StringIO.StringIO()
    guessed_format = guess_seq_file_format(empty_fhand)
    assert guessed_format is None