Exemple #1
0
                    help="Gff file with masked regions")
# -b/--logbase: integer option stored as args.logbase, 2 when not given.
parser.add_argument(
    "-b", "--logbase",
    action="store", dest="logbase", type=int, default=2,
    help="Logbase of y axis",
)
# All options are registered at this point; read the command line.
args = parser.parse_args()

# Read the VCF named on the command line (dont_parse_info_and_data switched on).
mutations = CollectionVCF(in_file=args.input,
                          from_file=True,
                          dont_parse_info_and_data=True)

# The reference genome is optional; when one is given, find_gaps() is run on it
# straight away.
reference_genome = None
if args.ref_genome:
    reference_genome = ReferenceGenome(args.ref_genome)
    reference_genome.find_gaps()

# Masked regions are optional as well: every record parsed from the GFF file is
# stored under its id (a later record with the same id replaces an earlier one).
masked_regions = None
if args.masked_regions:
    with open(args.masked_regions) as gff_fd:
        masked_regions = {record.id: record for record in GFF.parse(gff_fd)}

mutations.rainfall_plot(args.output_prefix,
                        single_fig=True,
                        dpi=args.dpi,
Exemple #2
0
# Input VCF (optional on the command line).
parser.add_argument(
    "-f", "--vcf_file",
    action="store",
    dest="vcf_file",
    help="Vcf file with SNVs",
)
# Annotation GFF (mandatory).
parser.add_argument(
    "-a", "--annotations",
    action="store",
    dest="annotations",
    required=True,
    help="Gff file with annotations of reference genome",
)
# Masking GFF (mandatory).
parser.add_argument(
    "-m", "--masking",
    action="store",
    dest="masking",
    required=True,
    help="Gff file with masked regions",
)
# Integer clustering threshold, 1000 unless overridden.
parser.add_argument(
    "-d", "--threshold",
    action="store",
    dest="threshold",
    type=int,
    default=1000,
    help="Threshold for extractig clusters. Depends on extraction method.",
)
# Output directory for clustering side data, "clustering" unless overridden.
parser.add_argument(
    "-y", "--clustering_directory",
    action="store",
    dest="clust_dir",
    default="clustering",
    help="Directory where to output additional data about clustering",
)

args = parser.parse_args()


import os  # local import: only needed to derive the default index path

# Use the index given on the command line when there is one; otherwise derive it
# from the reference path by swapping the extension for ".idx".
# os.path.splitext strips only a real trailing extension of the last path
# component, so an extension-less reference ("genome") gives "genome.idx"
# instead of the bare ".idx" produced by splitting on every ".", and dots in
# directory names no longer truncate the path.
if args.reference_index:
    index_file = args.reference_index
else:
    index_file = "%s.idx" % os.path.splitext(args.reference)[0]
reference = ReferenceGenome(args.reference, index_file=index_file)

sample = args.sample_name

# Clustering settings: output directory and distance threshold come from the
# command line, the minimum cluster size is fixed here.
clustering_dir = args.clust_dir
distance_threshold = args.threshold
reference.find_gaps()
min_cluster_size = 3

annotations_dict = {}
# Feature types on the left are renamed to the label on the right.
annotation_synonym_dict = {"three_prime_UTR": "3'_UTR",
                           "five_prime_UTR": "5'_UTR",
                           "snoRNA": "ncRNA",
                           "snRNA": "ncRNA"
                           }
annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat",
    "--subplot_size",
    action="store",
    dest="subplot_size",
    default=4,
    type=int,
    help=
    "Size of subplot(inches) on distribution histogram with all scaffolds. Default: 4"
)

args = parser.parse_args()

# Results per sample, kept in the order the samples were listed.
count_dict = OrderedDict()

reference = ReferenceGenome(
    args.reference,
    masked_regions=None,
    index_file="refgen.idx",
    filetype="fasta",
    mode=args.parsing_mode,
    black_list=[],
)
reference.find_gaps(min_gap_length=10)

# Sample names and VCF files are paired positionally.
for sample_name, vcf_file in zip(args.sample_names, args.input):
    variants = CollectionVCF(from_file=True,
                             in_file=vcf_file,
                             parse_only_coordinates=True)

    # Without an explicit step the windows advance by their own size.
    if args.window_step is None:
        step = args.window_size
    else:
        step = args.window_step

    count_dict[sample_name] = variants.count_variants_in_windows(
        args.window_size,
        step,
        reference.region_length,
        ignore_scaffolds_shorter_than_window=True,
        output_prefix="%s.%s" % (args.output_prefix, sample_name),
        skip_empty_windows=False)
Exemple #4
0
def filter_by_power_05(record):
    """Return True when ``record.info_dict['Power']`` is at least 0.05."""
    power = record.info_dict['Power']
    return bool(power >= 0.05)


def filter_by_power_10(record):
    """Return True when ``record.info_dict['Power']`` is at least 0.10."""
    power = record.info_dict['Power']
    return bool(power >= 0.10)


if __name__ == "__main__":

    workdir = "analyse/"
    # Creating the working directory stays best-effort, but only filesystem
    # errors from os.mkdir are ignored. A bare ``except`` would also swallow
    # KeyboardInterrupt, SystemExit and plain programming errors.
    try:
        os.mkdir(workdir)
    except OSError:
        pass
    reference = ReferenceGenome("LAN210_v0.10m.fasta",
                                index_file="LAN210_v0.10m.idx")

    sample_set_names_list = ["PmCDA1_3d"]

    # Clustering / rainfall settings used by this run.
    clustering_dir = "clustering"
    rainfall_dir = "rainfall"
    distance_threshold = 1000
    reference.find_gaps()
    min_cluster_size = 3

    bad_regions_file = "LAN210_v0.10m_masked_all_not_in_good_genes.gff"
    # The string literal below is disabled code: the bad-regions GFF is named
    # above but not loaded here.
    """
    bad_regions = CollectionGFF(input_file=bad_regions_file,
                                from_file=True)
    """
    gff_file = "merged_annotations_Nagalakshmi_tranf_to_LAN210_v0.10m.gff3"
Exemple #5
0
                         "the lowest memory consuming - index_db. Default: index_db")
# Comma-separated region names are split into a list; an empty list when omitted.
parser.add_argument("-b", "--region_black_list", action="store", dest="black_list", type=lambda s: s.split(","),
                    default=[],
                    help="Comma-separated ist of region names in genome to be not mutated")
# Output VCF path (mandatory).
parser.add_argument("-v", "--out_vcf", action="store", dest="out_vcf", required=True,
                    help=".vcf with snp set")
parser.add_argument("-n", "--number_of_mutations", action="store", dest="mut_number", type=int, default=10000,
                    help="Number of mutations in set")
# NOTE(review): the default and the help text used to read "h**o", which looks
# like a masked spelling of "homo" (the help pairs it with "hetero" and spells
# out "homozygous"). Restored here; confirm against the values accepted by
# generate_snp_set.
parser.add_argument("-z", "--zygoty", action="store", dest="zygoty", default="homo",
                    help="Zygoty of mutations in set. At moment only 100%% heterozygous or 100%% "
                         "homozygous sets can be generated. "
                         "Allowed values: homo, hetero. Default: homo")
# The raw string is converted by parse_substititions (defined outside this excerpt).
parser.add_argument("-s", "--substitutions", action="store", dest="substitutions", type=parse_substititions,
                    help="Set of substitution. MUST BE i following form: "
                         "<ref_base_1>:<comma-separetad_alternatives>-<ref_base_2>:<comma-separetad_alternatives> "
                         "Alternatives can be not set. If so all possible variants will be choosen as "
                         "a set of alternatives."
                         "If no reference bases was set - sites with all four bases will be considered as mutation sites. "
                         "Example:  G:T,C,A-T-A:G ")

args = parser.parse_args()

# Parse the masking GFF and turn the records into a dict via SeqIO.to_dict
# (presumably keyed by record id -- confirm against the SeqIO in use).
masking_records = GFF.parse(args.masking_gff)
masked_regions_dict = SeqIO.to_dict(masking_records)

reference_genome = ReferenceGenome(
    args.reference_genome,
    masked_regions=masked_regions_dict,
    index_file=args.ref_gen_idx,
    filetype="fasta",
    mode=args.parsing_mode,
    black_list=args.black_list,
)

# Generate the requested number of mutations and write them to the output VCF.
reference_genome.generate_snp_set(
    args.mut_number,
    substitution_dict=args.substitutions,
    zygoty=args.zygoty,
    out_vcf=args.out_vcf,
)

Exemple #6
0
# Comma-separated GFF paths are split into a list (None when the option is absent).
parser.add_argument(
    "-m", "--masked_regions",
    action="store",
    dest="masked_regions",
    type=lambda s: s.split(","),
    help="Comma-separated list of Gff file with masked regions",
)

# Integer window size, 100000 unless overridden.
parser.add_argument(
    "-w", "--window_size",
    action="store",
    dest="window_size",
    type=int,
    default=100000,
    help="Size of the windows Default: 100000",
)
# Integer window step; stays None when not given.
parser.add_argument(
    "-s", "--window_step",
    action="store",
    dest="window_step",
    type=int,
    default=None,
    help="Step of the sliding windows. Default: window size, i.e windows are staking",
)
# How the input sequence file is read; "parse" unless overridden.
parser.add_argument(
    "-p", "--parsing_mode",
    action="store",
    dest="parsing_mode",
    default="parse",
    help="Parsing mode for input sequence file. "
         "Possible variants: 'index_db', 'index', 'parse'(default)",
)


args = parser.parse_args()

# Build the reference from the input sequence file; masking comes from the GFF
# list given on the command line rather than from a prepared dict.
reference = ReferenceGenome(
    args.input,
    masked_regions=None,
    index_file="refgen.idx",
    filetype="fasta",
    mode=args.parsing_mode,
    black_list=[],
    masking_gff_list=args.masked_regions,
)

# NOTE(review): args.window_step is forwarded as-is and is None when -s is not
# given; presumably the method falls back to the window size -- confirm.
reference.count_gaped_and_masked_positions_in_windows(
    args.window_size,
    args.window_step,
    ignore_scaffolds_shorter_than_window=True,
    output_prefix=args.output_prefix,
    min_gap_len=1,
)