def stream_fa(sequence_file: str) -> Generator[FastA, None, None]:
    """Stream the records of a FASTA or FASTQ file as ``FastA`` objects.

    Format and compression are picked from the end of the file name:
    ``fq``/``fastq`` are parsed as FASTQ and ``fa``/``fasta`` as FASTA; the
    same suffixes followed by ``.gz`` are read through ``gzip``. For FASTQ
    input the quality string is read but dropped.

    :param sequence_file: path of the sequence file to read.
    :return: generator yielding ``FastA(header, sequence)`` per record.
    :raises Exception: when the generator is first advanced, if the file
        name matches none of the recognised suffixes.
    """
    # Classify the file name first; the suffix groups are tested in the same
    # order as the branches they replace.
    if sequence_file.endswith(('fq.gz', 'fastq.gz')):
        is_fastq, compressed = True, True
    elif sequence_file.endswith(('fq', 'fastq')):
        is_fastq, compressed = True, False
    elif sequence_file.endswith(('fasta.gz', 'fa.gz')):
        is_fastq, compressed = False, True
    elif sequence_file.endswith(('fasta', 'fa')):
        is_fastq, compressed = False, False
    else:
        raise Exception(f'{sequence_file} not a sequence file.')

    if compressed:
        source = gzip.open(sequence_file, 'rt')
    else:
        source = open(sequence_file)

    with source as handle:
        if is_fastq:
            records = Bio.SeqIO.QualityIO.FastqGeneralIterator(handle)
            for title, residues, _quality in records:
                yield FastA(title, residues)
        else:
            for title, residues in FastaIO.SimpleFastaParser(handle):
                yield FastA(title, residues)
def stream_fa(infile):
    """Yield ``(header, sequence)`` tuples from a plain or gzipped FASTA file.

    Names ending in ``fasta.gz``/``fa.gz`` are opened with ``gzip`` in text
    mode; names ending in ``fasta``/``fa`` are opened as ordinary text files.
    Because this is a generator, the suffix check (and the exception for an
    unrecognised name) only happens once iteration starts.

    :param infile: path of the FASTA file.
    :raises Exception: if ``infile`` has none of the accepted suffixes.
    """
    if infile.endswith(('fasta.gz', 'fa.gz')):
        source = gzip.open(infile, 'rt')
    elif infile.endswith(('fasta', 'fa')):
        source = open(infile, 'rt')
    else:
        raise Exception(f'{infile} not a sequence file.')

    with source as handle:
        for title, residues in FastaIO.SimpleFastaParser(handle):
            yield (title, residues)
def openFasta(path):
    """Load a FASTA file into a dict keyed by the first word of each title.

    Each record title is cut at its first whitespace (the original note says
    this mirrors how reference names appear in a BAM file) and mapped to the
    record's sequence.
    """
    from Bio.SeqIO import FastaIO

    with open(path) as handle:
        # Records are first collected under their full titles, then re-keyed;
        # this two-step order decides which record wins on key collisions.
        by_full_title = dict(FastaIO.SimpleFastaParser(handle))

    trimmed = {}
    for title, sequence in by_full_title.items():
        trimmed[title.split()[0]] = sequence
    return trimmed
def get_proteins(path):
    """Return the unified names of all record titles in a FASTA file.

    The distinct titles yielded by ``fio.SimpleFastaParser`` are gathered
    into a set, which is handed to ``res.get_unified_names``; whatever that
    call returns is the result.
    """
    with open(path) as handle:
        titles = {title for title, _sequence in fio.SimpleFastaParser(handle)}
    return res.get_unified_names(titles)
def all_sequence_names_from_fasta_file(input_fasta_file_name):
    """Return the sequence name of every record in a fasta file.

    Args:
      input_fasta_file_name: string. Path of the fasta file to read.

    Returns:
      list of string: ``get_sequence_name_from`` applied to each record
      title, in file order.
    """
    sequence_names = []
    with tf.io.gfile.GFileText(input_fasta_file_name) as fasta_handle:
        for full_name, _sequence in FastaIO.SimpleFastaParser(fasta_handle):
            sequence_names.append(get_sequence_name_from(full_name))
    return sequence_names
def filter_fasta_file_by_sequence_name(input_fasta_file_name,
                                       acceptable_sequence_names):
    """Yield the fasta entries whose sequence name is in an allow-list.

    Args:
      input_fasta_file_name: string. Fasta file whose record titles are
        formatted seqName_actualFamily.
      acceptable_sequence_names: iterable of string. Plain sequence names
        (seqName only, without the family part used in the file's titles).

    Yields:
      strings, each a complete fasta entry: ``>title``, newline, sequence,
      newline.
    """
    wanted = set(acceptable_sequence_names)
    with tf.io.gfile.GFileText(input_fasta_file_name) as fasta_handle:
        for full_name, residues in FastaIO.SimpleFastaParser(fasta_handle):
            if get_sequence_name_from(full_name) not in wanted:
                continue
            yield ''.join(('>', full_name, '\n', residues, '\n'))
def fasta_to_list(self, fasta_fn):
    """Read a .fasta file into a list of ``[name, sequence]`` lists.

    The file is looked up relative to the current working directory. In each
    record the underscores in the name are replaced by spaces and the
    sequence is upper-cased. If the sequences do not all have the same
    length, a message is logged on the root logger; the list is returned
    either way.

    Args:
        fasta_fn: file name (or relative path) of the .fasta file.

    Returns:
        list of two-element lists ``[name, sequence]``, in file order.
    """
    filepath = Path.cwd() / fasta_fn

    # Use a context manager so the handle is closed as soon as parsing ends
    # (previously the file object was opened inline and never closed).
    with open(filepath) as handle:
        fasta_list = [
            [name.replace('_', ' '), sequence.upper()]
            for name, sequence in FastaIO.SimpleFastaParser(handle)
        ]

    # More than one distinct length means the sequences are not all the same
    # size; an empty file has no lengths and therefore triggers no message.
    distinct_lengths = {len(sequence) for _name, sequence in fasta_list}
    if len(distinct_lengths) > 1:
        logging.getLogger().info(
            f'WARNING: .fasta file has inconsistency: {fasta_fn}')

    return fasta_list
def write_phmmer_predictions(train_sequence_file, test_sequence_file,
                             parsed_output):
    """Run phmmer over the test sequences and write the results as csv.

    The test fasta file is cut into batches of ``_BLOCK_SIZE`` records; each
    batch becomes one keyword-argument dict for ``run_phmmer_for_query``,
    and the calls are dispatched through ``parallel.RunInParallel`` with
    ``_THREADS`` workers. Every output object of every query result is
    written to ``parsed_output`` as one ``format_as_csv()`` line.

    The original documentation describes the csv content as
    ``sequence_name,true_label,predicted_label``, where sequence_name is the
    uniprot identifier including domain indices and the labels are pfam
    family accession ids.

    Args:
      train_sequence_file: string. Filename of fasta file of unaligned
        training sequences.
      test_sequence_file: string. Fasta file of unaligned test sequences.
      parsed_output: string. csv file for parsed phmmer outputs.
    """
    logging.info('Writing predictions to %s', parsed_output)

    with tf.io.gfile.GFile(test_sequence_file, 'r') as fasta_handle:
        batches = pfam_utils.batch_iterable(
            FastaIO.SimpleFastaParser(fasta_handle), _BLOCK_SIZE)
        # Materialise the per-batch arguments while the file is still open.
        phmmer_kwargs = [
            {
                'train_sequence_file': train_sequence_file,
                'list_of_protein_name_and_sequence': batch,
            }
            for batch in batches
        ]

    results = parallel.RunInParallel(
        run_phmmer_for_query,
        phmmer_kwargs,
        _THREADS,
        cancel_futures=True)

    with tf.io.gfile.GFile(parsed_output, 'w') as csv_out:
        for query_result in results:
            for hit in query_result:
                csv_out.write(hit.format_as_csv() + '\n')
def openFasta(path):
    """Load a FASTA file into a dict mapping each full title to its sequence.

    If a title occurs more than once, the last record with that title is the
    one kept.
    """
    from Bio.SeqIO import FastaIO

    with open(path) as fasta_handle:
        records = FastaIO.SimpleFastaParser(fasta_handle)
        return dict(records)
def preprocess_msa_refs(self, ref_fname, s0_h, cons, min_d=None):
    """Project every MSA reference onto the columns where s0 has no gap.

    Args:
        ref_fname: MSA fasta file name.
        s0_h: header of the s0 record inside the MSA.
        cons: sequence whose length must equal that of the gap-free s0; it
            is also what ``ham_nogaps`` compares each reference against when
            ``min_d`` is given.
        min_d: if not None, keep only references whose ``ham_nogaps``
            distance to ``cons`` is at least this value.

    Return:
        Tuple ``(s0_seq, refs)``: the gap-free s0 string, and a list of
        ``(header, str_c2i(sequence))`` pairs for the references aligned to
        s0 (filtered by ``min_d`` when it is set).
    """
    # Parse the MSA; sequences are upper-cased and passed through
    # str_only_ACGTgap before any further processing.
    with open(ref_fname) as msa_file:
        msa_records = [
            (header, str_only_ACGTgap(seq.upper()))
            for header, seq in FastaIO.SimpleFastaParser(msa_file)
        ]

    # First record whose header equals s0_h; empty string when absent.
    s0_aligned = next(
        (seq for header, seq in msa_records if header == s0_h), "")
    s0_seq = s0_aligned.replace("-", "")
    assert len(s0_aligned) > 0, "Reference %s not found in msa" % s0_h
    assert len(s0_seq) == len(cons), (len(s0_seq), len(cons))

    # Alignment columns in which s0 carries a base rather than a gap.
    kept_columns = [
        col for col, base in enumerate(s0_aligned) if base != "-"
    ]

    encoded = []
    for header, seq in msa_records:
        assert len(seq) == len(s0_aligned)
        # Anything that is not A/C/G/T at a kept column is replaced by the
        # s0 character at that column.
        projected = "".join(
            seq[col] if seq[col] in "ACGT" else s0_aligned[col]
            for col in kept_columns
        )
        assert len(projected) == len(s0_seq)
        encoded.append((header, str_c2i(projected)))

    # Wherever an encoded reference holds the value 4 (presumably the code
    # str_c2i assigns to "N" -- confirm against str_c2i), s0 must be "N".
    for header, codes in encoded:
        for pos, code in enumerate(codes):
            if code == 4:
                assert s0_seq[pos] == "N"

    if min_d is None:
        return s0_seq, encoded

    far_enough = [
        (header, codes)
        for header, codes in encoded
        if ham_nogaps(cons, codes) >= min_d
    ]
    return s0_seq, far_enough
def call_moods(one_motif, genome, output_directory, p_value, moods_bg,
               condition2, condition1, control_dict, overexpression_dict,
               differences, which_score):
    """Scan a FASTA file for one motif and collect bigWig signal per hit.

    The motif is converted with ``pfm_to_log_odds`` and scanned with MOODS on
    both strands over every record of ``genome``. For each hit the mean
    signal over the hit interval is read from both bigWig files (NaN values
    count as 0), stored through ``save_bw_score``, and the absolute
    difference of the two means is appended to ``differences`` when it is
    neither NaN nor zero.

    Args:
        one_motif: path of the motif (.pfm) file.
        genome: path of the FASTA file to scan. A header containing ``:`` is
            treated as a peak (``name:start-...``) and used as the key
            passed to ``save_bw_score``; a header without ``:`` is treated
            as starting at position 0.
        output_directory: not used by this function.
        p_value: p-value from which the MOODS thresholds are derived.
        moods_bg: background distribution; converted to a tuple.
        condition2: path of the bigWig file whose values feed
            ``control_dict``.
        condition1: path of the bigWig file whose values feed
            ``overexpression_dict``.
        control_dict: passed to, and replaced by the return value of,
            ``save_bw_score`` for every hit.
        overexpression_dict: same as ``control_dict`` for ``condition1``.
        differences: list extended in place with the non-zero differences.
        which_score: forwarded unchanged to ``save_bw_score``.

    Returns:
        Tuple ``(control_dict, overexpression_dict, differences)``.

    Exits the process via ``sys.exit()`` if either signal file is not a
    bigWig file or if the motif file is not accepted by ``pfm_to_log_odds``.
    """
    bw_condition2 = pyBigWig.open(condition2)
    bw_condition1 = pyBigWig.open(condition1)
    try:
        # check that both signal files really are bigWig files
        if not bw_condition2.isBigWig() or not bw_condition1.isBigWig():
            logger.info("please provide the bigwig file!")
            sys.exit()

        # standard parameters for moods; this part was taken and modified
        # from the MOODS GitHub page
        pseudocount = 0.0001
        bg = tuple(moods_bg)
        matrix_names = [os.path.basename(one_motif)]

        valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount)
        if not valid:
            logger.info("The input for moods was not validated by the MOODS.parsers.pfm. Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)")
            sys.exit()

        matrices = [matrix]
        matrices_rc = [MOODS.tools.reverse_complement(matrix, 4)]
        matrices_all = matrices + matrices_rc
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, p_value, 4)
            for m in matrices_all
        ]

        scanner = MOODS.scan.Scanner(7)
        scanner.set_motifs(matrices_all, bg, thresholds)

        # Stays '' until a header with positions (a peak) has been seen.
        key_for_bed_dict = ''

        with open(genome) as handle:
            for header, seq in bio.SimpleFastaParser(handle):
                header_splitted = re.split(r':', header)
                if len(header_splitted) == 1:
                    # no positions given: use 0 as the first position and
                    # split once more
                    header = header + ":0-"
                    header_splitted = re.split(r':', header)
                    logger.info("moods works with " + header)
                else:
                    # a file with peaks: the peak header is the key used to
                    # look up additional information in the bed dictionary
                    key_for_bed_dict = header

                chromosom = header_splitted[0]
                positions = re.split(r'-', header_splitted[-1])

                results = scanner.scan(seq)
                fr = results[:len(matrix_names)]  # forward strand
                rr = results[len(matrix_names):]  # reverse strand
                # use + and - to indicate the strand
                results = [
                    [(r.pos, r.score, '+', ()) for r in fr[i]]
                    + [(r.pos, r.score, '-', ()) for r in rr[i]]
                    for i in range(len(matrix_names))
                ]

                for scanned_matrix, result in zip(matrices, results):
                    # 4 rows: hit length equals the number of columns.
                    # 16 rows: one position longer than the column count.
                    # Fixed: the +1 used to be applied to the matrix row
                    # itself, inside len(), instead of to its length.
                    # Any other row count leaves motif_len unset, as before.
                    if len(scanned_matrix) == 4:
                        motif_len = len(scanned_matrix[0])
                    elif len(scanned_matrix) == 16:
                        motif_len = len(scanned_matrix[0]) + 1

                    for hit in sorted(result, key=lambda hit: hit[0]):
                        pos = hit[0]
                        hitseq = seq[pos:pos + motif_len]
                        # rounded to 15 digits after the point (a str)
                        score = format(hit[1], '.15f')

                        if key_for_bed_dict != '':
                            start = pos + 1
                        else:
                            start = int(positions[0]) + pos + 1

                        # start of the peak + start of the motif within the
                        # peak.
                        # NOTE(review): `start` already includes a +1 while
                        # the original remark says not to add 1 because
                        # bigWig is 0-based -- confirm the intended offset.
                        real_start = int(positions[0]) + int(start)
                        real_end = real_start + len(hitseq)

                        # mean signal over the hit in each bigWig file
                        bw_scores_control = np.mean(np.nan_to_num(np.array(
                            list(bw_condition2.values(
                                chromosom, real_start, real_end)))))
                        bw_scores_overexpression = np.mean(np.nan_to_num(
                            np.array(list(bw_condition1.values(
                                chromosom, real_start, real_end)))))

                        control_dict = save_bw_score(
                            key_for_bed_dict, control_dict,
                            bw_scores_control, float(score), which_score)
                        overexpression_dict = save_bw_score(
                            key_for_bed_dict, overexpression_dict,
                            bw_scores_overexpression, float(score),
                            which_score)

                        bw_difference = abs(
                            bw_scores_overexpression - bw_scores_control)
                        if (not np.isnan(bw_difference)
                                and bw_difference != 0.0):
                            differences.append(bw_difference)

        return control_dict, overexpression_dict, differences
    finally:
        # Release the bigWig handles on every exit path, including
        # sys.exit() and errors raised while scanning.
        bw_condition2.close()
        bw_condition1.close()
def call_moods(one_motif, genome, output_directory, p_value, moods_bg):
    """Scan a FASTA file for one motif with MOODS and write the sorted hits.

    The motif is converted with ``pfm_to_log_odds`` and scanned on both
    strands over every record of ``genome``. Each hit is written as one
    tab-separated line (motif id, alternate name, chromosome or peak key,
    start, end, strand, score) to ``moods_output_unsorted_<motif>.txt`` in
    ``output_directory``; that file is then sorted with ``sort -k 1 -V``
    into ``moods_output_<motif>.txt``.

    Args:
        one_motif: path of the motif (.pfm) file. The part of its base name
            after the last ``_`` is used as the motif id.
        genome: path of the FASTA file to scan. A header containing ``:`` is
            treated as a peak (``name:start-...``) and written in place of
            the chromosome name; a header without ``:`` is treated as
            starting at position 0.
        output_directory: directory that receives both output files.
        p_value: p-value from which the MOODS thresholds are derived.
        moods_bg: background distribution, or None for
            ``MOODS.tools.flat_bg(4)``.

    Returns:
        Path of the sorted output file.

    Exits the process via ``sys.exit()`` if the motif file is not accepted
    by ``pfm_to_log_odds``; both output files exist (empty) in that case.
    """
    # Local import: only the final sort step needs it.
    import subprocess

    # setting standard parameters for moods
    pseudocount = 0.0001
    if moods_bg is None:
        bg = MOODS.tools.flat_bg(4)
    else:
        bg = tuple(moods_bg)

    logger.info("moods will work with the p_value " + str(p_value) +
                " and the bg " + str(bg))

    motif_name = os.path.basename(one_motif)
    motif_stem = os.path.splitext(motif_name)[0]
    moods_output_unsorted_name = os.path.join(
        output_directory, "moods_output_unsorted_" + motif_stem + ".txt")
    moods_output_name = os.path.join(
        output_directory, "moods_output_" + motif_stem + ".txt")

    # Both files are still created before the motif is validated, as before,
    # but the handles are now closed on every exit path.
    with open(moods_output_unsorted_name, 'w') as moods_output_file_unsorted, \
            open(moods_output_name, 'w'):
        matrix_names = [motif_name]

        valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount)
        if not valid:
            logger.info(
                "The input for moods was not validated by the MOODS.parsers.pfm. Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)"
            )
            sys.exit()

        logger.info("please be patient, moods is working on the data")

        matrices = [matrix]
        matrices_rc = [MOODS.tools.reverse_complement(matrix, 4)]
        matrices_all = matrices + matrices_rc
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, p_value, 4)
            for m in matrices_all
        ]

        scanner = MOODS.scan.Scanner(7)
        scanner.set_motifs(matrices_all, bg, thresholds)

        # Stays '' until a header with positions (a peak) has been seen.
        key_for_bed_dict = ''

        with open(genome) as handle:
            for header, seq in bio.SimpleFastaParser(handle):
                header_splitted = re.split(r':', header)
                if len(header_splitted) == 1:
                    # no positions given: use 0 as the first position and
                    # split once more
                    header = header + ":0-"
                    header_splitted = re.split(r':', header)
                    logger.info("moods works with " + header)
                else:
                    # a file with peaks: the peak header is the key used to
                    # look up additional information in the bed dictionary
                    key_for_bed_dict = header

                chromosom = header_splitted[0]
                positions = re.split(r'-', header_splitted[-1])

                results = scanner.scan(seq)
                fr = results[:len(matrix_names)]  # forward strand
                rr = results[len(matrix_names):]  # reverse strand
                # use + and - to indicate the strand
                results = [
                    [(r.pos, r.score, '+', ()) for r in fr[i]]
                    + [(r.pos, r.score, '-', ()) for r in rr[i]]
                    for i in range(len(matrix_names))
                ]

                for scanned_matrix, matrix_name, result in zip(
                        matrices, matrix_names, results):
                    # id of the motif: the part after the last underscore
                    motif_id = re.split(r'_', matrix_name)[-1]
                    # alternate name: the file name without the id and
                    # without the trailing character, which is the "_"
                    motif_alternate_name = matrix_name.replace(
                        motif_id, '')[:-1]

                    # 4 rows: hit length equals the number of columns.
                    # 16 rows: one position longer than the column count.
                    # Fixed: the +1 used to be applied to the matrix row
                    # itself, inside len(), instead of to its length.
                    # Any other row count leaves motif_len unset, as before.
                    if len(scanned_matrix) == 4:
                        motif_len = len(scanned_matrix[0])
                    elif len(scanned_matrix) == 16:
                        motif_len = len(scanned_matrix[0]) + 1

                    for hit in sorted(result, key=lambda hit: hit[0]):
                        strand = hit[2]
                        pos = hit[0]
                        hitseq = seq[pos:pos + motif_len]
                        # rounded to 15 digits after the point (a str)
                        score = format(hit[1], '.15f')

                        if key_for_bed_dict != '':
                            start = pos + 1
                            end = pos + len(hitseq)
                            # write the peak key instead of the chromosome
                            # name so it can be looked up in the bed file
                            chromosom = key_for_bed_dict
                        else:
                            start = int(positions[0]) + pos + 1
                            end = start + len(hitseq) - 1

                        moods_output_file_unsorted.write('\t'.join([
                            motif_id, motif_alternate_name, chromosom,
                            str(start), str(end), strand, score
                        ]) + '\n')

    # Sort the hits. The command is passed as an argument list so that paths
    # containing spaces or shell metacharacters are handled correctly; as
    # before, a failing sort is not treated as an error.
    with open(moods_output_name, 'w') as moods_output_file:
        subprocess.run(
            ["sort", "-k", "1", "-V", moods_output_unsorted_name],
            stdout=moods_output_file,
            check=False)

    return moods_output_name