Example #1
    def __init__(self, prefix, amrs, normalize=True):
        self.normalize = normalize
        self.sentences = []

        alltokens, allpos, alllemmas, allnes, alldepslines = self._loadFromFile(
            prefix + ".out")
        if amrs:
            allgraphs = open(prefix + ".graphs").read().split("\n\n")
            a = Alignments(prefix + ".alignments", allgraphs)
            allalignments = a.alignments

            for graph, alignments, depslines, tokens, pos, lemmas, nes in zip(
                    allgraphs, allalignments, alldepslines, alltokens, allpos,
                    alllemmas, allnes):
                graph = graph.strip()
                amr = amrannot.AMR.parse_AMR_line(graph.replace("\n", ""),
                                                  False)
                variables = {}
                for n, v in zip(amr.nodes, amr.node_values):
                    variables[n] = v
                role_triples = amr.get_triples3()
                relations = []
                for (var1, label, var2) in role_triples:
                    if label == "TOP":
                        relations.append(("TOP", ":top", var1))
                    else:
                        relations.append(
                            (str(var1), ":" + str(label), str(var2)))
                dependencies = []
                for line in depslines.split("\n"):
                    pattern = r"^(.+)\(.+-([0-9]+), .+-([0-9]+)\)"
                    regex = re.match(pattern, line)
                    if regex is not None:
                        label = regex.group(1)
                        a = int(regex.group(2)) - 1
                        b = int(regex.group(3)) - 1
                        if a == -1:
                            dependencies.append((b, 'ROOT', b))
                        elif a != b:
                            dependencies.append((a, label, b))
                self.sentences.append(
                    AMRSentence(tokens, pos, lemmas, nes, dependencies,
                                variables, relations, graph, alignments))
        else:
            for depslines, tokens, pos, lemmas, nes in zip(
                    alldepslines, alltokens, allpos, alllemmas, allnes):
                dependencies = []
                for line in depslines.split("\n"):
                    pattern = r"^(.+)\(.+-([0-9]+), .+-([0-9]+)\)"
                    regex = re.match(pattern, line)
                    if regex is not None:
                        label = regex.group(1)
                        a = int(regex.group(2)) - 1
                        b = int(regex.group(3)) - 1
                        if a == -1:
                            dependencies.append((b, 'ROOT', b))
                        elif a != b:
                            dependencies.append((a, label, b))
                self.sentences.append(
                    AMRSentence(tokens, pos, lemmas, nes, dependencies))
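
The dependency regex in this example expects Stanford-style typed-dependency lines such as nsubj(runs-2, dog-1); the 1-based token indices are shifted to 0-based, and an index of -1 (CoreNLP's ROOT-0 head token) marks the root. A minimal standalone sketch of that decomposition; the sample line is illustrative, not from the original data:

import re

# Hypothetical Stanford typed-dependency line: label(head-index, dependent-index)
line = "nsubj(runs-2, dog-1)"
pattern = r"^(.+)\(.+-([0-9]+), .+-([0-9]+)\)"
match = re.match(pattern, line)
if match is not None:
    label = match.group(1)                # "nsubj"
    head = int(match.group(2)) - 1        # 1, 0-based index of the head token
    dependent = int(match.group(3)) - 1   # 0, 0-based index of the dependent
    print(label, head, dependent)         # nsubj 1 0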
Example #2
    def __init__(self, prefix, amrs, demo=False, normalize=True):
        self.normalize = normalize
        self.sentences = []

        if demo:
            blocks = prefix.split("\n\n")
        else:
            blocks = open(prefix + ".out", 'r').read().split("\n\n")

        alltokens, allpos, alllemmas, allnes, alldependencies = self._loadFromCoreNLP(
            blocks)

        if amrs:
            allgraphs = open(prefix + ".graphs").read().split("\n\n")
            a = Alignments(prefix + ".alignments", allgraphs)
            allalignments = a.alignments

            for graph, alignments, dependencies, tokens, pos, lemmas, nes in zip(
                    allgraphs, allalignments, alldependencies, alltokens,
                    allpos, alllemmas, allnes):
                graph = graph.strip()
                amr = amrannot.AMR.parse_AMR_line(graph.replace("\n", ""),
                                                  False)
                variables = {}
                for n, v in zip(amr.nodes, amr.node_values):
                    variables[n] = v
                role_triples = amr.get_triples3()
                relations = []
                for (var1, label, var2) in role_triples:
                    if label == "TOP":
                        relations.append(("TOP", ":top", var1))
                    else:
                        relations.append(
                            (str(var1), ":" + str(label), str(var2)))
                self.sentences.append(
                    AMRSentence(tokens, pos, lemmas, nes, dependencies,
                                variables, relations, graph, alignments))
        else:
            for dependencies, tokens, pos, lemmas, nes in zip(
                    alldependencies, alltokens, allpos, alllemmas, allnes):
                self.sentences.append(
                    AMRSentence(tokens, pos, lemmas, nes, dependencies))
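
A hedged usage sketch for this constructor; the class name AMRCorpus and the train.* file prefix are assumptions, inferred from the prefix + ".out" / ".graphs" / ".alignments" reads above:

# Assumes train.out (CoreNLP output), train.graphs and train.alignments sit
# side by side; AMRCorpus is a hypothetical name for the class shown above.
corpus = AMRCorpus("train", amrs=True)
print("loaded %d sentences" % len(corpus.sentences))
# With amrs=True each AMRSentence also carries variables, relations,
# the raw AMR graph string, and token-to-node alignments.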
Example #3
def main():
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--download_gb", "-d", help="Name of the GenBank division to download (e.g. PLN or MAM).")
    parser.add_argument("--download_gb2", "-d2", help="""Name of the optional second GenBank division to download. Use this
                                                         option if the ingroup and outgroup are in different GenBank divisions.""")
    parser.add_argument("--path", "-p", help="Absolute path to download GenBank files to. Defaults to ./genbank/")
    parser.add_argument("--ingroup", "-i", help="Ingroup clade to build supermatrix.")
    parser.add_argument("--outgroup", "-o", help="Outgroup clade to build supermatrix.")
    parser.add_argument("--cores", "-c", help="The number of CPU cores to use for parallel processing. Defaults to the max available.")
    parser.add_argument("--id", "-id", help="UCLUST id threshold to cluster taxa. Defaults to 0.50")
    parser.add_argument("--evalue", "-e", help="BLAST E-value threshold to cluster taxa. Defaults to 1e-10")
    parser.add_argument("--length", "-l", help="Threshold of sequence length percent similarity to cluster taxa. Defaults to 0.25")
    parser.add_argument("--maxlength", "-maxl", help="Maximum length of sequences to include in UCLUST clusters. Defaults to 5000")
    parser.add_argument("--minlength", "-minl", help="Minimum length of sequences to include in UCLUST clusters. Defaults to 100")
    parser.add_argument("--min_clusters", "-minc", help="Minimum number of taxa needed for clusters. Defaults to 4")
    parser.add_argument("--max_ingroup", "-m", help="Maximum number of taxa to include in ingroup. Default is none (no maximum limit).") 
    parser.add_argument("--guide", "-g", help="""FASTA file containing sequences to guide cluster construction. If this option is 
                                                 selected then all-by-all BLAST comparisons are not performed.""")
    parser.add_argument("--alignments", "-a", nargs='+', help="List of aligned FASTA files to build supermatrix instead of mining GenBank.")
    parser.add_argument("--salignments", "-sa", nargs='+', help="List of SUMAC alignments to build supermatrix instead of mining GenBank.")
    parser.add_argument("--search", "-s", action='store_true', help="Turn on search and cluster mode. Will not make alignments or supermatrix.")
    parser.add_argument("--decisiveness", "-de", action='store_true', help="Calculate partial decisiveness. For larger matrices this may be slow.")
    parser.add_argument("--hac", action='store_true', help="Use HAC single-linkage clustering algorithm instead of the default UCLUST algorithm.")
    parser.add_argument("--slink", action='store_true', help="Use the SLINK clustering algorithm instead of the default UCLUST algorithm.")
    args = parser.parse_args()
 
    sys.stdout = Logger()
    color = Color()

    print("")
    print(color.blue + "SUMAC: supermatrix constructor v2.22" + color.done)
    print("")

    num_cores = multiprocessing.cpu_count()
    if args.cores and int(args.cores) <= num_cores:
        num_cores = int(args.cores) 

    if args.alignments:
        # if the user provides alignments:
        alignment_files = args.alignments
        alignments = Alignments(alignment_files, "aligned", num_cores)
    elif args.salignments:
        # if the user inputs SUMAC alignments from previous run
        alignment_files = args.salignments
        alignments = Alignments(alignment_files, "sumac_aligned", num_cores)
    else:
        if args.search:
            print(color.yellow + "Running in search and cluster mode. Clusters will not be aligned and supermatrix will not be assembled." + color.done)

        # first download and set up SQLite db if necessary
        if args.path:
            gb_dir = args.path
        else:
            gb_dir = os.path.abspath("genbank/")
        # if the user requests downloading
        if args.download_gb:
            divisions = [args.download_gb]
            if args.download_gb2:
                divisions.append(args.download_gb2)
            GenBankSetup.download(divisions, gb_dir)
            print(color.yellow + "Setting up SQLite database..." + color.done)
            gb = GenBankSetup.sqlite(gb_dir)
        # the user didn't request downloading, so check for genbank directory
        elif not os.path.exists(gb_dir):
            print(color.red + "GenBank database not downloaded. Re-run with the -d option. See --help for more details." + color.done)
            sys.exit(0)
        # the genbank directory exists so check for sequences and index them
        else:
            gb = GenBankSetup.sqlite(gb_dir)
        print(color.purple + "%i sequences indexed!" % len(gb) + color.done)

        # check for ingroup and outgroup
        if args.ingroup:
            ingroup = args.ingroup
            if args.outgroup:
                outgroup = args.outgroup
            else:
                outgroup = "NONE"
        else:
            print(color.red + "Please specify ingroup. See --help for details." + color.done)
            sys.exit(0)
        
        # search db for ingroup and outgroup sequences
        print(color.blue + "Ingroup = " + ingroup + color.done)
        if args.outgroup:
            print(color.blue + "Outgroup = " + outgroup + color.done)
        print(color.blue + "Searching for ingroup and outgroup sequences..." + color.done)
        if args.max_ingroup:
            search_results = GenBankSearch(gb, ingroup, outgroup, int(args.max_ingroup))
        else:
            search_results = GenBankSearch(gb, ingroup, outgroup)
        ingroup_keys = search_results.ingroup_keys
        outgroup_keys = search_results.outgroup_keys
        all_seq_keys = ingroup_keys + outgroup_keys
        if len(all_seq_keys) == 0:
            print(color.red + "No sequences found for the ingroup and outgroup!" + color.done)
            sys.exit(0)

        # determine sequence length similarity threshold
        length_threshold = 0.25
        if args.length:
            length_threshold = float(args.length)
        print(color.blue + "Using sequence length similarity threshold " + color.red + str(length_threshold) + color.done)

        # determine UCLUST id threshold
        id_threshold = 0.5
        if args.id:
            id_threshold = float(args.id)
        print(color.blue + "Using UCLUST id threshold " + color.red + str(id_threshold) + color.done)
        
        # determine e-value threshold
        evalue_threshold = 1e-10
        if args.evalue:
            evalue_threshold = float(args.evalue)
        print(color.blue + "Using BLAST e-value threshold " + color.red + str(evalue_threshold) + color.done)

        # now build clusters, first checking whether we are using FASTA file of guide sequences
        # or doing all-by-all comparisons
        if args.guide:
            # use FASTA file of guide sequences
            print(color.blue + "Building clusters using the guide sequences..." + color.done)
            cluster_builder = GuidedClusterBuilder(args.guide, all_seq_keys, length_threshold, evalue_threshold, gb_dir, num_cores)
        else:
            # cluster using UCLUST
            uclust_error = False
            if not (args.slink or args.hac):
                print(color.blue + "Clustering sequences with UCLUST..." + color.done)
                maxlength = 5000
                minlength = 100
                if args.maxlength:
                    maxlength = int(args.maxlength)
                if args.minlength:
                    minlength = int(args.minlength)
                cluster_builder = UCLUSTClusterBuilder(gb, all_seq_keys, gb_dir, num_cores, minlength, maxlength, length_threshold, id_threshold, evalue_threshold)
                if (cluster_builder.error == True):
                    uclust_error = True
                else:
                    print(color.purple + "Clustering completed..." + color.done)
            if (args.slink or args.hac) or (uclust_error == True):
                # make distance matrix
                print(color.blue + "Making distance matrix for all sequences..." + color.done)
                distance_matrix = DistanceMatrixBuilder(gb, all_seq_keys, length_threshold, gb_dir, num_cores).distance_matrix

                # cluster sequences
                if args.hac:
                    print(color.purple + "Clustering sequences using the HAC algorithm..." + color.done)
                    cluster_builder = HACClusterBuilder(all_seq_keys, distance_matrix, evalue_threshold)
                else:
                    print(color.purple + "Clustering sequences using the SLINK algorithm..." + color.done)
                    cluster_builder = SLINKClusterBuilder(all_seq_keys, distance_matrix, evalue_threshold)

        print(color.purple + "Found " + color.red + str(len(cluster_builder.clusters)) + color.purple + " clusters." + color.done)
        if len(cluster_builder.clusters) == 0:
            print(color.red + "No clusters found." + color.done)
            sys.exit(0)

        # filter clusters, make FASTA files
        print(color.yellow + "Building sequence matrices for each cluster." + color.done)
        min_clusters = 4
        if args.min_clusters:
            min_clusters = int(args.min_clusters)
        if (args.slink or args.hac or args.guide) or (uclust_error == True):
            cluster_builder.assemble_fasta(gb, min_clusters)
        else:
            cluster_builder.assemble_fasta_uclust(min_clusters)
        print(color.purple + "Kept " + color.red + str(len(cluster_builder.clusters)) + color.purple + " clusters, discarded those with < " + str(min_clusters) + " taxa." + color.done)
        
        # if we are in search and cluster mode we are done
        if args.search:
            sys.exit(0) 
        
        if len(cluster_builder.clusters) == 0:
            print(color.red + "No clusters left to align." + color.done)
            sys.exit(0)
        # now align each cluster with MAFFT
        print(color.blue + "Aligning clusters with MAFFT..." + color.done)
        alignments = Alignments(cluster_builder.cluster_files, "unaligned", num_cores)
    
    alignments.print_data()
    alignments.make_gene_region_csv()

    # concatenate alignments
    print(color.purple + "Concatenating alignments..." + color.done)
    supermatrix = Supermatrix(alignments)
   
    try:
        imp.find_module('matplotlib')
        imp.find_module('numpy')
        matplot = True
    except ImportError:
        matplot = False
        print(color.red + "Skipping generating graphs since matplotlib is not installed." + color.done)

    if not args.alignments: # and not args.salignments:
        # only make genbank_csv if the sequences were mined direct from genbank
        supermatrix.make_genbank_csv()
    supermatrix.print_data()
    if matplot:
        supermatrix.make_sequence_data_figure()
    if args.decisiveness:
        supermatrix.print_PD()
        if matplot:
            supermatrix.make_sequence_decisiveness_figure()
        supermatrix.make_decisiveness_csv()
    print(color.yellow + "Final supermatrix: " + color.red + "alignments/supermatrix_concatenated.fasta" + color.done)
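
Most of the numeric options above are parsed by hand ("if args.x: x = float(args.x)"); argparse can do the coercion and defaulting itself via type= and default=. A sketch of that alternative, not the original SUMAC code:

import argparse

parser = argparse.ArgumentParser()
# argparse coerces and defaults the thresholds, removing the manual
# fallback blocks from main().
parser.add_argument("--id", type=float, default=0.50,
                    help="UCLUST id threshold to cluster taxa. Defaults to 0.50")
parser.add_argument("--evalue", type=float, default=1e-10,
                    help="BLAST E-value threshold to cluster taxa. Defaults to 1e-10")
parser.add_argument("--length", type=float, default=0.25,
                    help="Threshold of sequence length percent similarity. Defaults to 0.25")
args = parser.parse_args(["--id", "0.9"])
print(args.id, args.evalue, args.length)   # 0.9 1e-10 0.25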
Example #4
    def __init__(self, prefix, amrs, demo=False, normalize=True):
        self.normalize = normalize
        self.sentences = []

        if demo:
            blocks = prefix.split("\n\n")
        else:
            # Read the syntactic info for each sentence, e.g. the tokens, POS tags, lemmas, named entities, and dependency parses. Store them in separate lists.
            blocks = open(prefix + ".out", 'r').read().split("\n\n")
        alltokens, allpos, alllemmas, allnes, alldepslines = self._loadFromCoreNLP(
            blocks)

        if amrs:
            # Store each sentence's graph and alignments in lists.
            allgraphs = open(prefix + ".graphs").read().split("\n\n")

            a = Alignments(prefix + ".alignments", allgraphs)
            allalignments = a.alignments

            for graph, alignments, depslines, tokens, pos, lemmas, nes in zip(
                    allgraphs, allalignments, alldepslines, alltokens, allpos,
                    alllemmas, allnes):
                graph = graph.strip()
                amr = amrannot.AMR.parse_AMR_line(graph.replace("\n", ""),
                                                  False)
                variables = {}
                for n, v in zip(amr.nodes, amr.node_values):
                    variables[n] = v
                role_triples = amr.get_triples3()
                relations = []
                for (var1, label, var2) in role_triples:
                    if label == "TOP":
                        relations.append(("TOP", ":top", var1))
                    else:
                        relations.append(
                            (str(var1), ":" + str(label), str(var2)))
                dependencies = []
                for line in depslines.split("\n"):
                    pattern = r"^(.+)\(.+-([0-9]+), .+-([0-9]+)\)"
                    regex = re.match(pattern, line)
                    if regex is not None:
                        label = regex.group(1)
                        a = int(regex.group(2)) - 1
                        b = int(regex.group(3)) - 1
                        if a == -1:
                            dependencies.append((b, 'ROOT', b))
                        elif a != b:
                            dependencies.append((a, label, b))
                self.sentences.append(
                    AMRSentence(tokens, pos, lemmas, nes, dependencies,
                                variables, relations, graph, alignments))
        else:
            for depslines, tokens, pos, lemmas, nes in zip(
                    alldepslines, alltokens, allpos, alllemmas, allnes):
                dependencies = []
                for line in depslines.split("\n"):
                    pattern = r"^(.+)\(.+-([0-9]+), .+-([0-9]+)\)"
                    regex = re.match(pattern, line)
                    if regex is not None:
                        label = regex.group(1)
                        a = int(regex.group(2)) - 1
                        b = int(regex.group(3)) - 1
                        if a == -1:
                            dependencies.append((b, 'ROOT', b))
                        elif a != b:
                            dependencies.append((a, label, b))
                self.sentences.append(
                    AMRSentence(tokens, pos, lemmas, nes, dependencies))
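
The dependency-extraction loop is repeated verbatim in both branches here and in Example #1; a hedged refactor sketch pulling it into a module-level helper (the function name is an assumption):

import re

# Compiled once instead of rebuilt per line; the raw string avoids the
# invalid escape sequences of the original plain string literal.
_DEP_PATTERN = re.compile(r"^(.+)\(.+-([0-9]+), .+-([0-9]+)\)")

def parse_dependencies(depslines):
    """Turn Stanford typed-dependency lines into (head, label, dependent) triples."""
    dependencies = []
    for line in depslines.split("\n"):
        match = _DEP_PATTERN.match(line)
        if match is None:
            continue
        label = match.group(1)
        a = int(match.group(2)) - 1
        b = int(match.group(3)) - 1
        if a == -1:                 # head token ROOT-0 marks the root
            dependencies.append((b, "ROOT", b))
        elif a != b:                # skip degenerate self-loops
            dependencies.append((a, label, b))
    return dependencies

Each inner loop in the examples would then collapse to dependencies = parse_dependencies(depslines).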
Example #5
    def test_mafft_alignment(self):
        import os
        from alignments import Alignments
        alignments = Alignments(["test.fasta"], "unaligned")
        self.assertTrue(os.path.exists("alignments/test.fasta"))
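
A sketch of the unittest scaffolding this method presumably lives in; the class name and the runner block are assumptions, not from the original file:

import os
import unittest

from alignments import Alignments

class TestAlignments(unittest.TestCase):
    def test_mafft_alignment(self):
        # Building Alignments from an unaligned FASTA is expected to write
        # the MAFFT-aligned result under alignments/.
        Alignments(["test.fasta"], "unaligned")
        self.assertTrue(os.path.exists("alignments/test.fasta"))

if __name__ == "__main__":
    unittest.main()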