def stream_fa(sequence_file: str) -> Generator[FastA, None, None]:
    '''
    Stream the records of a FASTA or FASTQ file, plain or gzip-compressed.

    The parser is picked from the file name suffix: ``fq``/``fastq`` select
    the FASTQ parser and ``fa``/``fasta`` select the FASTA parser. A name
    ending in one of those suffixes followed by ``.gz`` is read through gzip
    in text mode. FASTQ quality strings are parsed but discarded, so both
    formats produce the same kind of record.

    Because this is a generator function, nothing is opened or checked until
    the first record is requested.

    :param sequence_file: path of the sequence file to read
    :return: generator yielding one ``FastA(header, sequence)`` per record
    :raises Exception: on first iteration, if the suffix is not recognised
    '''
    if sequence_file.endswith(('fq.gz', 'fastq.gz', 'fq', 'fastq')):
        is_fastq = True
    elif sequence_file.endswith(('fasta.gz', 'fa.gz', 'fasta', 'fa')):
        is_fastq = False
    else:
        raise Exception(f'{sequence_file} not a sequence file.')

    # Every accepted compressed suffix ends in '.gz' and no plain one does.
    if sequence_file.endswith('.gz'):
        source = gzip.open(sequence_file, 'rt')
    else:
        source = open(sequence_file)

    with source as handle:
        if is_fastq:
            fastq_records = Bio.SeqIO.QualityIO.FastqGeneralIterator(handle)
            for title, bases, _quality in fastq_records:
                yield FastA(title, bases)
        else:
            for title, bases in FastaIO.SimpleFastaParser(handle):
                yield FastA(title, bases)
def stream_fa(infile):
    """Yield ``(header, sequence)`` tuples from a plain or gzipped FASTA file.

    The file name must end in ``fasta``/``fa`` (read as plain text) or in
    ``fasta.gz``/``fa.gz`` (read through gzip in text mode). As this is a
    generator function, the suffix check and the open happen when the first
    record is requested.

    :param infile: path of the FASTA file to read
    :return: generator of ``(header, sequence)`` tuples
    :raises Exception: on first iteration, if the suffix is not recognised
    """
    if infile.endswith(('fasta.gz', 'fa.gz')):
        source = gzip.open(infile, 'rt')
    elif infile.endswith(('fasta', 'fa')):
        source = open(infile, 'rt')
    else:
        raise Exception(f'{infile} not a sequence file.')

    with source as handle:
        for title, bases in FastaIO.SimpleFastaParser(handle):
            yield (title, bases)
Beispiel #3
0
def openFasta(path):
    """Load a FASTA file into a dict keyed by the first word of each title.

    Each title is cut at its first whitespace (matching reference names as
    they appear in a BAM file) and mapped to its sequence string.
    """
    from Bio.SeqIO import FastaIO
    with open(path) as fasta_handle:
        # Collect by full title first; a repeated full title keeps its last
        # sequence before the names are trimmed.
        by_full_title = dict(FastaIO.SimpleFastaParser(fasta_handle))
        by_ref_name = {}
        for full_title, sequence in by_full_title.items():
            by_ref_name[full_title.split()[0]] = sequence
        return by_ref_name
Beispiel #4
0
def get_proteins(path):
    """Return the unified names of all record titles in a FASTA file.

    The distinct titles found in ``path`` are collected into a set and handed
    to ``res.get_unified_names``; whatever that call returns is the result.
    """
    with open(path) as fasta_handle:
        titles = {title for title, _ in fio.SimpleFastaParser(fasta_handle)}

    return res.get_unified_names(titles)
Beispiel #5
0
def all_sequence_names_from_fasta_file(input_fasta_file_name):
  """Collects the sequence name of every record in a fasta file.

  Each record title is converted with `get_sequence_name_from`; the sequences
  themselves are ignored.

  Args:
    input_fasta_file_name: string. Path of the fasta file to read.

  Returns:
    list of string, one entry per record, in file order.
  """
  sequence_names = []
  with tf.io.gfile.GFileText(input_fasta_file_name) as fasta_file:
    for full_name, _ in FastaIO.SimpleFastaParser(fasta_file):
      sequence_names.append(get_sequence_name_from(full_name))
  return sequence_names
Beispiel #6
0
def filter_fasta_file_by_sequence_name(input_fasta_file_name,
                                       acceptable_sequence_names):
  """Yields the fasta entries whose sequence name is in the accepted set.

  Args:
    input_fasta_file_name: string. Path of a fasta file whose record titles
      can be reduced to a sequence name by `get_sequence_name_from`.
    acceptable_sequence_names: iterable of string. The sequence names to
      keep, in the form returned by `get_sequence_name_from`.

  Yields:
    strings of the form '>title\\nsequence\\n', one per retained record.
  """
  # Materialise once so each membership test below is a set lookup.
  allowed_names = set(acceptable_sequence_names)
  with tf.io.gfile.GFileText(input_fasta_file_name) as fasta_file:
    for full_name, residues in FastaIO.SimpleFastaParser(fasta_file):
      if get_sequence_name_from(full_name) not in allowed_names:
        continue
      yield ''.join(('>', full_name, '\n', residues, '\n'))
Beispiel #7
0
    def fasta_to_list(self, fasta_fn):
        """Load a .fasta file into a list of ``[name, sequence]`` lists.

        Underscores in each record name are replaced by spaces and every
        sequence is upper-cased. If the sequences do not all have the same
        length, a message is logged on the root logger; the list is returned
        either way.

        Args:
            fasta_fn: file name, resolved against the current working
                directory.

        Returns:
            list of two-element lists ``[name, sequence]`` in file order.
        """
        filepath = Path.cwd() / fasta_fn
        # The context manager closes the handle; previously it was leaked.
        with open(filepath) as handle:
            fasta_list = [
                [name.replace('_', ' '), sequence.upper()]
                for name, sequence in FastaIO.SimpleFastaParser(handle)
            ]

        # More than one distinct length means the sequences are inconsistent.
        distinct_sizes = {len(sequence) for _, sequence in fasta_list}
        if len(distinct_sizes) > 1:
            logging.getLogger().info(
                f'WARNING: .fasta file has inconsistency: {fasta_fn}')

        return fasta_list
Beispiel #8
0
def write_phmmer_predictions(train_sequence_file, test_sequence_file,
                             parsed_output):
  """Runs phmmer on batches of test sequences and writes the results as csv.

  The test fasta file is split into batches of `_BLOCK_SIZE` records, each
  batch is passed to `run_phmmer_for_query` via `parallel.RunInParallel`, and
  every returned output is written as one csv line using its
  `format_as_csv()` method.

  Args:
    train_sequence_file: string. Filename of fasta file of unaligned training
      sequences; forwarded unchanged to every phmmer call.
    test_sequence_file: string. Fasta file of unaligned test sequences.
    parsed_output: string. Path of the csv file to write.
  """
  logging.info('Writing predictions to %s', parsed_output)

  # The batches are built while the file is still open because the fasta
  # parser reads lazily from the handle.
  phmmer_kwargs_per_batch = []
  with tf.io.gfile.GFile(test_sequence_file, 'r') as fasta_file:
    fasta_records = FastaIO.SimpleFastaParser(fasta_file)
    for record_batch in pfam_utils.batch_iterable(fasta_records, _BLOCK_SIZE):
      phmmer_kwargs_per_batch.append({
          'train_sequence_file': train_sequence_file,
          'list_of_protein_name_and_sequence': record_batch,
      })

  batch_results = parallel.RunInParallel(
      run_phmmer_for_query,
      phmmer_kwargs_per_batch,
      _THREADS,
      cancel_futures=True)

  with tf.io.gfile.GFile(parsed_output, 'w') as csv_file:
    for outputs_for_batch in batch_results:
      for single_output in outputs_for_batch:
        csv_file.write(single_output.format_as_csv() + '\n')
Beispiel #9
0
def openFasta(path):
    """Load a FASTA file into a dict mapping each full title to its sequence.

    When a title occurs more than once, the last sequence read wins.
    """
    from Bio.SeqIO import FastaIO
    with open(path) as fasta_handle:
        records = FastaIO.SimpleFastaParser(fasta_handle)
        return {title: sequence for title, sequence in records}
Beispiel #10
0
    def preprocess_msa_refs(self, ref_fname, s0_h, cons, min_d=None):
        """ Project every MSA reference onto the columns of s0.

        Columns where s0 has a gap are dropped from every record, so any
        insertion relative to s0 is cut out. At the remaining columns a
        character other than A/C/G/T is replaced by s0's character.

        Args:
            ref_fname: MSA fasta file name.
            s0_h: header of the s0 record inside the MSA.
            cons: sequence compared against each reference with
                ham_nogaps; must have the same length as ungapped s0.
            min_d: if given, keep only references whose ham_nogaps
                distance to cons is at least this value.

        Return:
            A tuple (s0_seq, refs): s0 with gaps removed, and a list of
            (header, encoded sequence) pairs as produced by str_c2i.
        """
        # Parse refs
        with open(ref_fname) as fasta:
            records = [
                (header, str_only_ACGTgap(seq.upper()))
                for header, seq in FastaIO.SimpleFastaParser(fasta)
            ]

        # First record whose header matches s0; empty string if none does
        s0_aligned = next(
            (seq for header, seq in records if header == s0_h), "")

        # s0 without its alignment gaps
        s0_seq = s0_aligned.replace("-", "")
        assert len(s0_aligned) > 0, "Reference %s not found in msa" % s0_h

        assert len(s0_seq) == len(cons), (len(s0_seq), len(cons))

        # Alignment columns where s0 carries a base
        kept_columns = [
            col for col, base in enumerate(s0_aligned) if base != "-"]

        # Remove indels relative to s0
        projected_records = []
        for header, seq in records:
            assert len(seq) == len(s0_aligned)
            # Anything that is not A/C/G/T at a kept column is coerced to s0
            projected = "".join(
                seq[col] if seq[col] in "ACGT" else s0_aligned[col]
                for col in kept_columns
            )
            assert len(projected) == len(s0_seq)
            projected_records.append((header, str_c2i(projected)))

        # Wherever an encoded value is 4 (presumably the code for "N" --
        # confirm against str_c2i), s0 must be "N" at that position too
        for _, encoded in projected_records:
            for position, code in enumerate(encoded):
                if code == 4:
                    assert s0_seq[position] == "N"

        # Optionally keep only references far enough from cons
        if min_d is None:
            return s0_seq, projected_records

        far_enough = [
            (header, encoded)
            for header, encoded in projected_records
            if ham_nogaps(cons, encoded) >= min_d
        ]
        return s0_seq, far_enough
Beispiel #11
0
def call_moods(one_motif, genome, output_directory, p_value, moods_bg, condition2, condition1, control_dict, overexpression_dict, differences, which_score):
    """Scan a FASTA file for hits of one motif and collect bigWig signal per hit.

    The motif is converted to a log-odds matrix, scanned on both strands with
    MOODS, and for every hit the mean signal of both bigWig files over the
    hit interval is recorded through ``save_bw_score``.

    Args:
        one_motif: path of the motif (.pfm) file.
        genome: path of the FASTA file to scan. A header of the form
            ``name:start-end`` is treated as a peak; a header without ``:``
            is treated as starting at position 0.
        output_directory: not used by this function.
        p_value: p-value used to derive the MOODS score thresholds.
        moods_bg: background distribution, converted to a tuple.
        condition2: path of the bigWig file whose values go to
            ``control_dict``.
        condition1: path of the bigWig file whose values go to
            ``overexpression_dict``.
        control_dict: passed to and replaced by the result of
            ``save_bw_score`` for every hit.
        overexpression_dict: same as ``control_dict``, for ``condition1``.
        differences: list; every non-zero, non-NaN absolute difference of
            the two mean signals is appended to it in place.
        which_score: forwarded unchanged to ``save_bw_score``.

    Returns:
        The tuple ``(control_dict, overexpression_dict, differences)``.
        The process exits via ``sys.exit()`` if either file is not a bigWig
        or the motif file is rejected by ``pfm_to_log_odds``.
    """
    # check if this is a bigwig file
    bw_condition2 = pyBigWig.open(condition2)
    bw_condition1 = pyBigWig.open(condition1)
    if not bw_condition2.isBigWig() or not bw_condition1.isBigWig():
        logger.info("please provide the bigwig file!")
        sys.exit()

    # prepare everything for moods
    # setting standard parameters for moods
    # this code was taken and modified from the MOODS GitHub page
    pseudocount = 0.0001

    bg = tuple(moods_bg)

    matrix_names = [os.path.basename(one_motif)]

    valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount)

    if not valid:
        logger.info("The input for moods was not validated by the MOODS.parsers.pfm. Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)")
        sys.exit()

    matrices = [matrix]
    matrices_rc = [MOODS.tools.reverse_complement(matrix, 4)]
    matrices_all = matrices + matrices_rc
    thresholds = [MOODS.tools.threshold_from_p(m, bg, p_value, 4) for m in matrices_all]

    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices_all, bg, thresholds)

    # Once a peak-style header has been seen this keeps its value for the
    # following records as well.
    key_for_bed_dict = ''

    with open(genome) as handle:

        for header, seq in bio.SimpleFastaParser(handle):

            header_splitted = re.split(r':', header)

            if len(header_splitted) == 1:  # no positions given
                # set the first position as 0 and split it once more
                header = header + ":0-"
                header_splitted = re.split(r':', header)
                logger.info("moods works with " + header)
            else:
                # the file contains peaks, so the peak header is used as the
                # key handed to save_bw_score
                key_for_bed_dict = header

            chromosom = header_splitted[0]
            positions = re.split(r'-', header_splitted[-1])

            results = scanner.scan(seq)

            fr = results[:len(matrix_names)]  # forward strand
            rr = results[len(matrix_names):]  # reverse strand

            # use + and - to indicate strand
            results = [
                [(r.pos, r.score, '+', ()) for r in fr[i]]
                + [(r.pos, r.score, '-', ()) for r in rr[i]]
                for i in range(len(matrix_names))
            ]

            for motif_matrix, result in zip(matrices, results):

                # A 4-row matrix has one column per motif position; a 16-row
                # matrix covers one more position than it has columns.
                if len(motif_matrix) == 4:
                    motif_length = len(motif_matrix[0])
                if len(motif_matrix) == 16:
                    # was len(matrix[0] + 1), which fails on a plain sequence
                    motif_length = len(motif_matrix[0]) + 1

                for hit in sorted(result, key=lambda hit: hit[0]):
                    pos = hit[0]
                    hitseq = seq[pos:pos + motif_length]
                    # fixed to 15 digits after the decimal point, type str
                    score = format(hit[1], '.15f')

                    if key_for_bed_dict != '':
                        start = pos + 1
                    else:
                        start = int(positions[0]) + pos + 1

                    # start of the peak + start of the motif within the peak
                    real_start = int(positions[0]) + int(start)
                    real_end = real_start + len(hitseq)

                    # mean signal over the hit, with NaN values read as 0
                    bw_scores_control = np.mean(np.nan_to_num(np.array(list(bw_condition2.values(chromosom, real_start, real_end)))))
                    bw_scores_overexpression = np.mean(np.nan_to_num(np.array(list(bw_condition1.values(chromosom, real_start, real_end)))))

                    control_dict = save_bw_score(key_for_bed_dict, control_dict, bw_scores_control, float(score), which_score)
                    overexpression_dict = save_bw_score(key_for_bed_dict, overexpression_dict, bw_scores_overexpression, float(score), which_score)

                    bw_difference = abs(bw_scores_overexpression - bw_scores_control)

                    if not np.isnan(bw_difference) and bw_difference != 0.0:
                        differences.append(bw_difference)

    return control_dict, overexpression_dict, differences
Beispiel #12
0
def call_moods(one_motif, genome, output_directory, p_value, moods_bg):
    """Scan a FASTA file for hits of one motif and write them to a sorted file.

    The motif is converted to a log-odds matrix and scanned on both strands
    with MOODS. Every hit is written as one tab-separated line (motif id,
    motif alternate name, chromosome or peak header, start, end, strand,
    score) to ``moods_output_unsorted_<motif>.txt``, which is then sorted
    with ``sort -k 1 -V`` into ``moods_output_<motif>.txt``.

    Args:
        one_motif: path of the motif (.pfm) file.
        genome: path of the FASTA file to scan. A header of the form
            ``name:start-end`` is treated as a peak; a header without ``:``
            is treated as starting at position 0.
        output_directory: directory in which both output files are created.
        p_value: p-value used to derive the MOODS score thresholds.
        moods_bg: background distribution, or ``None`` for the flat
            background ``MOODS.tools.flat_bg(4)``.

    Returns:
        Path of the sorted output file. The process exits via ``sys.exit()``
        if the motif file is rejected by ``pfm_to_log_odds``; in that case no
        output file is created.
    """
    # Local import so the block does not depend on a file-level import.
    import subprocess

    # setting standard parameters for moods
    pseudocount = 0.0001

    # identity test: '== None' is ambiguous for array-like backgrounds
    if moods_bg is None:
        bg = MOODS.tools.flat_bg(4)
    else:
        bg = tuple(moods_bg)

    logger.info("moods will work with the p_value " + str(p_value) +
                " and the bg " + str(bg))

    motif_name = os.path.basename(one_motif)
    motif_stem = os.path.splitext(motif_name)[0]

    moods_output_unsorted_name = os.path.join(
        output_directory, "moods_output_unsorted_" + motif_stem + ".txt")
    moods_output_name = os.path.join(
        output_directory, "moods_output_" + motif_stem + ".txt")

    matrix_names = [motif_name]

    # Validate the motif before any output file is opened, so the failure
    # path leaves no open handles behind.
    valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount)

    if not valid:
        logger.info(
            "The input for moods was not validated by the MOODS.parsers.pfm. Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)"
        )
        sys.exit()

    logger.info("please be patient, moods is working on the data")

    matrices = [matrix]
    matrices_rc = [MOODS.tools.reverse_complement(matrix, 4)]
    matrices_all = matrices + matrices_rc
    thresholds = [
        MOODS.tools.threshold_from_p(m, bg, p_value, 4)
        for m in matrices_all
    ]

    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices_all, bg, thresholds)

    # Once a peak-style header has been seen this keeps its value for the
    # following records as well.
    key_for_bed_dict = ''

    with open(genome) as handle, \
            open(moods_output_unsorted_name, 'w') as unsorted_file:

        for header, seq in bio.SimpleFastaParser(handle):

            header_splitted = re.split(r':', header)

            if len(header_splitted) == 1:  # no positions given
                # set the first position as 0 and split it once more
                header = header + ":0-"
                header_splitted = re.split(r':', header)
                logger.info("moods works with " + header)
            else:
                # the file contains peaks, so the peak header is written
                # instead of the chromosome name
                key_for_bed_dict = header

            chromosom = header_splitted[0]
            positions = re.split(r'-', header_splitted[-1])

            results = scanner.scan(seq)

            fr = results[:len(matrix_names)]  # forward strand
            rr = results[len(matrix_names):]  # reverse strand

            # use + and - to indicate strand
            results = [
                [(r.pos, r.score, '+', ()) for r in fr[i]]
                + [(r.pos, r.score, '-', ()) for r in rr[i]]
                for i in range(len(matrix_names))
            ]

            for motif_matrix, matrix_name, result in zip(
                    matrices, matrix_names, results):

                # the id is the part of the file name after the last '_'
                motif_id = re.split(r'_', matrix_name)[-1]
                # the alternate name is the file name without the id and
                # without the trailing '_'
                motif_alternate_name = matrix_name.replace(motif_id, '')[:-1]

                # A 4-row matrix has one column per motif position; a 16-row
                # matrix covers one more position than it has columns.
                if len(motif_matrix) == 4:
                    motif_length = len(motif_matrix[0])
                if len(motif_matrix) == 16:
                    # was len(matrix[0] + 1), which fails on a plain sequence
                    motif_length = len(motif_matrix[0]) + 1

                for hit in sorted(result, key=lambda hit: hit[0]):
                    strand = hit[2]
                    pos = hit[0]
                    hitseq = seq[pos:pos + motif_length]
                    # fixed to 15 digits after the decimal point, type str
                    score = format(hit[1], '.15f')

                    if key_for_bed_dict != '':
                        start = pos + 1
                        end = pos + len(hitseq)
                        chromosom = key_for_bed_dict
                    else:
                        start = int(positions[0]) + pos + 1
                        end = start + len(hitseq) - 1

                    unsorted_file.write('\t'.join([
                        motif_id, motif_alternate_name, chromosom,
                        str(start),
                        str(end), strand, score
                    ]) + '\n')

    # now sort the output of moods; an argument list avoids shell quoting
    # problems with unusual paths
    with open(moods_output_name, 'w') as sorted_file:
        subprocess.run(
            ["sort", "-k", "1", "-V", moods_output_unsorted_name],
            stdout=sorted_file)

    return moods_output_name