def build_filesystem():
    """Bind the project's directory layout to module-level globals.

    Rebinds ``SCRATCH``, ``BASEDIR``, ``REFERENCES``, ``DATA``, ``TRIM_DIR``,
    ``ASSEMBLY_DIR``, ``ANNOTATION_DIR`` and ``MAP_DIR`` at module scope.
    Takes no arguments and returns ``None``; the globals are the result.

    The scratch location is ``/scratch/<user>``.  ``<user>`` is taken from
    ``$USER`` when it is set and non-empty; otherwise it falls back to
    ``getpass.getuser()`` so the function does not die with a bare
    ``KeyError`` in environments that do not export ``USER``.

    NOTE(review): ``Dir.make`` / ``make_subdir`` presumably create the
    directory on disk if it is missing -- confirm against the ``Dir`` class.
    """
    import getpass  # local import: only needed for the $USER fallback

    global SCRATCH, BASEDIR, REFERENCES, DATA
    global TRIM_DIR, ASSEMBLY_DIR, ANNOTATION_DIR, MAP_DIR

    # DIRECTORY STRUCTURE
    user = os.environ.get('USER') or getpass.getuser()
    SCRATCH = Dir.make('/scratch/' + user)
    BASEDIR = Dir('/Strong/proj/.data/Project_NTM')
    REFERENCES = Dir(BASEDIR.join("lib", "reference_genomes"))

    # Every pipeline-stage directory hangs off BASEDIR/data.
    DATA = BASEDIR.make_subdir("data")
    # RAW_DIR = DATA.make_subdir("00_raw")  (disabled in the original)
    TRIM_DIR = DATA.make_subdir('trimmed_reads')
    ASSEMBLY_DIR = DATA.make_subdir('assemblies')
    ANNOTATION_DIR = DATA.make_subdir('annotations')
    MAP_DIR = DATA.make_subdir('mapped_reads')
# Sample shell session kept as a bare string literal (evaluated and then
# discarded at runtime): a grep for "," in one `.cf` file, listing rows whose
# fourth field holds two comma-separated alleles.
# NOTE(review): column meanings beyond what the sample shows are not
# documented here -- confirm against the tool that produces `.cf` files.
""" beagles[compbio] alma: grep , E-ALMA1.MAV_vs_NJH87.cf | more CP018363.1 83972 C T,G 0 34 34 SNP CP018363.1 148672 A G,C 0 14 14 SNP CP018363.1 1381815 G A,C 0 8 8 SNP CP018363.1 2334023 G A,T 0 30 30 SNP CP018363.1 2948951 T C,G 0 6 6 SNP CP018363.1 3621547 T C,G 0 35 35 SNP CP018363.1 4243092 A G,C 0 33 33 SNP CP018363.1 4562156 T C,G 0 15 15 SNP """

# Input directory (scanned below for `.cf` files) and the output paths, all
# of which live in its "fasta" subdirectory.
VCF_DIR = Dir("/Strong/proj/.data/alma")
FASTA_DIR = VCF_DIR.make_subdir("fasta")
MATRIX = FASTA_DIR.join("ALMA_matrix_N.fna")
CSV_OUT = FASTA_DIR.join("ALMA_stats.csv")
CSV_MUTATIONS = FASTA_DIR.join("ALMA_mutations.csv")

# Length of the per-isolate sequence list built in the loop below.
# NOTE(review): presumably the length of the reference sequence the isolates
# were compared against -- confirm.
POSITIONS = 5626623

# Every file in VCF_DIR whose name ends in '.cf' is one input.
files = VCF_DIR.files(endswith='.cf')

# Accumulators; nothing in the visible part of the script fills them yet.
records = []
multi_alleles = {}

print(f"Building Matrix: {MATRIX} from {len(files)} files.")
for i, file in enumerate(files):
    # Progress line: zero-padded index and the file's name.
    print(f"\t{i:02d} | {file.filename}")
    # The isolate name is the part of the filename before "_vs_".
    isolate = file.filename.split("_vs_")[0]
    # Start with a '-' placeholder at every position.
    seq = ['-'] * POSITIONS