# Load the optional reference: contig info from the .fai index (best effort)
# and a name -> sequence dictionary from the FASTA itself.
if args.reference:
    sys.stderr.write("Parsing reference. This could take a while...\n")

    # The index is optional: failing to read it only costs the contig header
    # entries, so warn and carry on. Only I/O and decoding errors are caught,
    # so that Ctrl-C and SystemExit are no longer swallowed.
    try:
        with open(args.reference + ".fai", "r") as fai:
            # keep the first two whitespace-separated fields of every line
            # (presumably scaffold name and length -- confirm against usage)
            scafs_lengths = [line.split()[:2] for line in fai]
    except (IOError, OSError, ValueError):
        sys.stderr.write(
            "WARNING: Could not parse fai file, vcf header will not contain contig entries...\n"
        )
        scafs_lengths = None

    # gzip.open defaults to binary mode, so plain "r" would hand bytes to
    # parseFasta while the uncompressed branch hands it str. Text mode ("rt")
    # makes both branches return str.
    refOpener = gzip.open if args.reference.endswith(".gz") else open
    with refOpener(args.reference, "rt") as ref:
        refDict = dict(zip(*genomics.parseFasta(ref.read())))

else:
    refDict = None
#########################################################################################

genoFileReader = genomics.GenoFileReader(genoFile)

# every name exposed by the reader
allNames = genoFileReader.names

# restrict to the comma-separated names in args.samples when given,
# otherwise use all of them
namesToUse = args.samples.split(",") if args.samples else allNames

# first line of the VCF output
outFile.write("##fileformat=VCFv4.2\n")

if refDict:
Beispiel #2
0
#args = parser.parse_args("-n 5 -t test.trees -o test.topos.txt -w test.weights.B.csv -g A a,b,c -g B d,e,f -g C g,h,i -g D j,k,l".split())

# input and output are fasta unless the corresponding phylip flag is set
inFormat = "phylip" if args.phylipIn else "fasta"
outFormat = "phylip" if args.phylipOut else "fasta"

l = args.lineLen


########################################################################

# the whole input is read from stdin in one go
allText = sys.stdin.read()

# choose the parser that matches the input format, then parse
parseInput = genomics.parseFasta if inFormat == "fasta" else genomics.parsePhylip
names, seqs = parseInput(allText)

# optionally keep only the first whitespace-delimited token of each name
if args.truncateNames:
    names = [fullName.split()[0] for fullName in names]

# regions given directly as arguments
regions = []
if args.regions:
    regions = [parseRegionText(regionText) for regionText in args.regions]

# regions listed in a file, one per line, as whitespace-separated fields
if args.regionsFile:
    with open(args.regionsFile, "r") as rf:
        regions.extend(parseRegionList(line.split()) for line in rf)

#only filter and chop sequences if necessary
if len(regions) >= 1:
    outNames = []
    outSeqs = []
    for seqName,start,end,ori in regions:
Beispiel #3
0
# Open the output destination: a gzipped file, a plain file, or stdout.
# All three must accept the same text (str) writes.
if args.genoFile:
    # "wt" rather than "w": gzip.open defaults to binary mode, which would
    # reject the str writes that the plain-file and stdout branches accept.
    if args.genoFile.endswith(".gz"): genoFile = gzip.open(args.genoFile, "wt")
    else: genoFile = open(args.genoFile, "w")
else: genoFile = sys.stdout

# random is only imported when the randomPhase option is set
if args.randomPhase:
    import random

#############################

#read sequence file
seqString = seqFile.read()

#parse into either one set of sequences (multi = False) or several (multi = True)
if args.format == "fasta":
    seqNames, seqs = genomics.parseFasta(seqString)
    multi = False

elif args.format == "phylip":
    #with phylip it is possible to have multiple alignments, so check whether that is the case
    pieces = genomics.parsePhylip(seqString)
    if isinstance(pieces, tuple):
        #a tuple result is unpacked directly as one (names, sequences) pair
        seqNames, seqs = pieces
        multi = False
    else:
        #any other result is treated as a collection of pairs and transposed
        _seqNames_, _seqs_ = zip(*pieces)
        multi = True

else:
    #fail here with a clear message rather than with a NameError on the
    #unbound 'multi' further down
    raise ValueError(
        "Unsupported sequence format: {0!r} (expected 'fasta' or 'phylip')".format(args.format)
    )

if not multi:
    #if there is a single set of sequences we parse it and output either as contigs or individuals
    #sequences to keep
args = parser.parse_args()

################################################################################

#get gene data
sys.stderr.write("Parsing annotation\n")
# gzipped annotations (".gz" suffix) are opened through gzip, others directly;
# both are read in text mode
annotationOpener = gzip.open if args.annotation.endswith(".gz") else open
with annotationOpener(args.annotation, "rt") as ann:
    geneData = genomics.parseGenes(ann.readlines(), fmt=args.format)

#get scaffold names
sys.stderr.write("Loading reference genome\n")
# read the whole reference in text mode, through gzip when the name ends in ".gz"
referenceOpener = gzip.open if args.ref.endswith(".gz") else open
with referenceOpener(args.ref, "rt") as ref:
    scaffolds, _sequences_ = genomics.parseFasta(ref.read(),
                                                 makeUppercase=True)

# map each scaffold name to the sequence at the same position
sequences = {
    scaffold: _sequences_[i] for i, scaffold in enumerate(scaffolds)
}

#open output: a named file (gzipped when it ends in ".gz") or stdout
if args.outFile:
    outputOpener = gzip.open if args.outFile.endswith(".gz") else open
    outFile = outputOpener(args.outFile, "wt")
else:
    outFile = sys.stdout

# tab-separated header line
headerFields = ("scaffold", "position", "codon_position", "substitution_type",
                "degeneracy")
outFile.write("\t".join(headerFields) + "\n")