Exemple #1
0
def generate_allele_gff(generated_files_folder, config_info, upload_flag,
                        validate_flag):
    """Produce Allele GFF output for every assembly that is not ignored.

    A DataSource is built over a query returning the primaryKey of each
    Assembly node.  Each returned assembly that is absent from
    ``ignore_assemblies`` is handed to ``generate_allele_gff_assembly``.
    When ``config_info.config["DEBUG"]`` is truthy, start, end and elapsed
    times are logged around the loop.

    Args:
        generated_files_folder: Forwarded unchanged to the per-assembly
            generator.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri``.
        upload_flag: Forwarded unchanged to the per-assembly generator.
        validate_flag: Forwarded unchanged to the per-assembly generator.
    """
    hms = "%H:%M:%S"
    assembly_query = """MATCH (a:Assembly)
                        RETURN a.primaryKey AS assemblyID"""
    assemblies = DataSource(get_neo_uri(config_info), assembly_query)

    if config_info.config["DEBUG"]:
        start_time = time.time()
        started_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time for generating Allele GFF files: %s",
                    started_label)

    for row in assemblies:
        assembly_id = row["assemblyID"]
        if assembly_id in ignore_assemblies:
            # ignore_assemblies is defined outside this function; members
            # of it get no GFF file.
            continue
        generate_allele_gff_assembly(assembly_id, generated_files_folder,
                                     config_info, upload_flag, validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created Allele GFF files - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #2
0
def generate_human_genes_interacting_with(generated_files_folder, config_info,
                                          upload_flag, validate_flag):
    """Generate the file of human genes that interact with another species.

    The query selects distinct genes of species ``NCBITaxon:9606`` that are
    linked through an InteractionGeneJoin node to a gene of species
    ``NCBITaxon:2697049`` and returns their primary key, symbol and name.
    The resulting DataSource is given to
    ``HumanGenesInteractingWithFileGenerator``, whose ``generate_file`` is
    then invoked.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    query = '''MATCH (s:Species)-[:FROM_SPECIES]-(g:Gene)--(i:InteractionGeneJoin)--(g2:Gene)-[:FROM_SPECIES]-(s2:Species)
               WHERE s.primaryKey ='NCBITaxon:2697049'
                   AND s2.primaryKey = 'NCBITaxon:9606'
               RETURN DISTINCT g2.primaryKey AS GeneID,
                               g2.symbol AS Symbol,
                               g2.name AS Name'''

    if config_info.config["DEBUG"]:
        logger.info("Human Genes Interacts With query")
        logger.info(query)
        start_time = time.time()
        logger.info("Start time: %s",
                    time.strftime("%H:%M:%S", time.gmtime(start_time)))

    data_source = DataSource(get_neo_uri(config_info), query)
    hgiw = human_genes_interacting_with_file_generator.HumanGenesInteractingWithFileGenerator(
        data_source, config_info, generated_files_folder)
    hgiw.generate_file(upload_flag=upload_flag, validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        end_time = time.time()
        # Message previously read "Genees"; spelling corrected.
        logger.info(
            "Created Human Genes Interacting with file - End time: %s",
            time.strftime("%H:%M:%S", time.gmtime(end_time)))
        logger.info(
            "Time Elapsed: %s",
            time.strftime("%H:%M:%S", time.gmtime(end_time - start_time)))
Exemple #3
0
def generate_gene_cross_reference_file(generated_files_folder, config_info,
                                       upload_flag, validate_flag):
    """Generate the gene cross-reference file.

    The query pairs every Gene node with each CrossReference node it is
    connected to and returns the gene key, the cross-reference id, its
    complete URL, its page and the gene's taxon id.  The DataSource is
    handed to ``GeneCrossReferenceFileGenerator`` and ``generate_file`` is
    called on it.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    hms = "%H:%M:%S"
    cross_ref_query = '''MATCH (g:Gene)--(cr:CrossReference)
                          RETURN g.primaryKey as GeneID,
                                 cr.globalCrossRefId as GlobalCrossReferenceID,
                                 cr.crossRefCompleteUrl as CrossReferenceCompleteURL,
                                 cr.page as ResourceDescriptorPage,
                                 g.taxonId as TaxonID'''

    if config_info.config["DEBUG"]:
        logger.info("Gene Cross Reference query")
        logger.info(cross_ref_query)
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time: %s", start_label)

    neo_uri = get_neo_uri(config_info)
    source = DataSource(neo_uri, cross_ref_query)
    generator = gene_cross_reference_file_generator.GeneCrossReferenceFileGenerator(
        source, generated_files_folder, config_info)
    generator.generate_file(upload_flag=upload_flag,
                            validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Gene Cross Reference file - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #4
0
def generate_uniprot_cross_reference(generated_files_folder, config_info,
                                     upload_flag, validate_flag):
    """Generate the UniProt cross-reference file.

    The query returns, for every Gene connected to a CrossReference whose
    ``prefix`` equals "UniProtKB", the gene key and the cross-reference id.
    The DataSource is handed to ``UniProtGenerator`` and ``generate_file``
    is called on it.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    hms = "%H:%M:%S"
    uniprot_query = '''MATCH (g:Gene)--(cr:CrossReference)
                                WHERE cr.prefix = "UniProtKB"
                                RETURN g.primaryKey as GeneID,
                                    cr.globalCrossRefId as GlobalCrossReferenceID'''

    if config_info.config["DEBUG"]:
        logger.info("UniProt Cross Reference query")
        logger.info(uniprot_query)
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time: %s", start_label)

    neo_uri = get_neo_uri(config_info)
    source = DataSource(neo_uri, uniprot_query)
    generator = uniprot_cross_reference_generator.UniProtGenerator(
        source, config_info, generated_files_folder)
    generator.generate_file(upload_flag=upload_flag,
                            validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created UniProt Cross Reference file - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #5
0
def generate_variant_allele_files(generated_files_folder, skip_chromosomes,
                                  config_info, upload_flag, validate_flag):
    """Generate a variant-allele file for each non-human species.

    A DataSource is built over a query returning the primaryKey of every
    Species node other than ``NCBITaxon:9606``; each returned species is
    handed to ``generate_variant_allele_species_file``.  When
    ``config_info.config["DEBUG"]`` is truthy, start, end and elapsed times
    are logged around the loop.

    Args:
        generated_files_folder: Forwarded to the per-species generator.
        skip_chromosomes: Forwarded to the per-species generator.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri``.
        upload_flag: Forwarded to the per-species generator.
        validate_flag: Forwarded to the per-species generator.
    """
    hms = "%H:%M:%S"
    species_query = """MATCH (s:Species)
                        WHERE s.primaryKey <> "NCBITaxon:9606"
                        RETURN s.primaryKey as speciesID"""
    species_rows = DataSource(get_neo_uri(config_info), species_query)

    if config_info.config["DEBUG"]:
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time for generating Variant Alleles files: %s",
                    start_label)

    for row in species_rows:
        generate_variant_allele_species_file(row["speciesID"],
                                             generated_files_folder,
                                             skip_chromosomes, config_info,
                                             upload_flag, validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created Variant Allele files - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #6
0
def generate_db_summary_file(generated_files_folder, config_info, upload_flag,
                             validate_flag):
    """Generate the database summary file.

    The query groups all nodes by their label set and returns each label
    set together with how many nodes carry it.  The DataSource is handed to
    ``DbSummaryFileGenerator`` and ``generate_file`` is called on it.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    hms = "%H:%M:%S"
    summary_query = '''MATCH (entity)
                          WITH labels(entity) AS entityTypes
                          RETURN count(entityTypes) AS frequency,
                          entityTypes'''

    if config_info.config["DEBUG"]:
        logger.info("DB Summary Query")
        logger.info(summary_query)
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time: %s", start_label)

    neo_uri = get_neo_uri(config_info)
    source = DataSource(neo_uri, summary_query)
    generator = db_summary_file_generator.DbSummaryFileGenerator(
        source, generated_files_folder, config_info)
    generator.generate_file(upload_flag=upload_flag,
                            validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created DB Summary file - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #7
0
def generate_allele_gff_assembly(assembly, generated_files_folder, config_info,
                                 upload_flag, validate_flag):
    """Generate the Allele GFF file for a single assembly.

    The query collects, per chromosome and allele, the variants located on
    the given assembly (with their gene-level consequences) and keeps only
    alleles that have more than one distinct variant (``WHERE num > 1``).
    The DataSource is handed to ``AlleleGffFileGenerator`` and
    ``generate_assembly_file`` is called on it.

    Args:
        assembly: Assembly primary key; it is spliced into the query text
            by string concatenation, so it must be a ``str``.
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_assembly_file``.
        validate_flag: Forwarded to ``generate_assembly_file``.
    """
    hms = "%H:%M:%S"
    allele_gff_query = '''MATCH (v:Variant)-[:ASSOCIATION]->(gl:GenomicLocation)-[:ASSOCIATION]->(:Assembly {primaryKey: "''' + assembly + '''"}),
                     (a:Allele)<-[:VARIATION]-(v:Variant)-[:LOCATED_ON]->(c:Chromosome),
                     (v:Variant)-[:VARIATION_TYPE]->(so:SOTerm),
                     (v:Variant)<-[:COMPUTED_GENE]-(g:Gene)-[:ASSOCIATION]->(glc:GeneLevelConsequence)<-[:ASSOCIATION]-(v:Variant)
WITH c,a,v,gl,so,
     COLLECT({geneID: g.primaryKey,
              geneSymbol: g.symbol,
              geneLevelConsequence: glc.geneLevelConsequence,
              impact: glc.impact}) AS glcs
WITH c.primaryKey AS chromosome,
     a.primaryKey AS ID,
     a.symbol AS symbol,
     a.symbolText AS symbol_text,
     COLLECT(DISTINCT {ID: v.primaryKey,
              genomicVariantSequence: v.genomicVariantSequence,
              genomicReferenceSequence: v.genomicReferenceSequence,
              soTerm: so.name,
              start: gl.start,
              end: gl.end,
              chromosome: gl.chromosome,
              geneLevelConsequences: glcs}) AS variants,
     COUNT(DISTINCT v.primaryKey) AS num
WHERE num > 1
RETURN chromosome, ID, symbol, symbol_text, variants
ORDER BY chromosome'''

    if config_info.config["DEBUG"]:
        logger.info("Allele GFF query")
        logger.info(allele_gff_query)
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time: %s", start_label)

    neo_uri = get_neo_uri(config_info)
    source = DataSource(neo_uri, allele_gff_query)
    generator = allele_gff_file_generator.AlleleGffFileGenerator(
        assembly, source, generated_files_folder, config_info)
    generator.generate_assembly_file(upload_flag=upload_flag,
                                     validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created Allele GFF file - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #8
0
def generate_orthology_file(generated_files_folder, config_info, upload_flag,
                            validate_flag):
    """Generate the orthology file.

    The query selects ORTHOLOGOUS gene pairs whose relationship has
    ``strictFilter`` set, and for each pair returns both genes' keys and
    symbols, both species' keys and names, the best / best-reverse score
    flags as strings, the names and count of algorithms that matched, and
    the count of algorithms that did not match.  The DataSource is handed
    to ``OrthologyFileGenerator`` and ``generate_file`` is called on it.

    The NOT_MATCHED branch constrains the join-to-gene edge to the
    ``:ASSOCIATION`` relationship type, the same way the MATCHED branch
    does.  A bare ``[ASSOCIATION]`` would bind a variable of that name and
    match a relationship of any type.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    orthology_query = '''MATCH (species1)<-[sa:FROM_SPECIES]-(gene1:Gene)-[o:ORTHOLOGOUS]->(gene2:Gene)-[sa2:FROM_SPECIES]->(species2:Species)
                       WHERE o.strictFilter
                       OPTIONAL MATCH (algorithm:OrthoAlgorithm)-[m:MATCHED]-(ogj:OrthologyGeneJoin)-[association:ASSOCIATION]-(gene1)
                       WHERE ogj.primaryKey = o.primaryKey
                       OPTIONAL MATCH (algorithm2:OrthoAlgorithm)-[m2:NOT_MATCHED]-(ogj2:OrthologyGeneJoin)-[:ASSOCIATION]-(gene1)
                       WHERE ogj2.primaryKey = o.primaryKey
                       RETURN gene1.primaryKey AS gene1ID,
                              gene1.symbol AS gene1Symbol,
                              gene2.primaryKey AS gene2ID,
                              gene2.symbol AS gene2Symbol,
                              COLLECT(DISTINCT algorithm.name) as Algorithms,
                              count(DISTINCT algorithm.name) AS numAlgorithmMatch,
                              count(DISTINCT algorithm2.name) AS numAlgorithmNotMatched,
                              toString(o.isBestScore) AS best,
                              toString(o.isBestRevScore) AS bestRev,
                              species1.primaryKey AS species1TaxonID,
                              species1.name AS species1Name,
                              species2.primaryKey AS species2TaxonID,
                              species2.name AS species2Name'''

    if config_info.config["DEBUG"]:
        logger.info("Orthology query")
        logger.info(orthology_query)
        start_time = time.time()
        logger.info("Start time: %s",
                    time.strftime("%H:%M:%S", time.gmtime(start_time)))

    data_source = DataSource(get_neo_uri(config_info), orthology_query)
    of = orthology_file_generator.OrthologyFileGenerator(
        data_source, generated_files_folder, config_info)
    of.generate_file(upload_flag=upload_flag, validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        end_time = time.time()
        # Previously logged "Created VCF file", a copy-paste slip that made
        # the orthology run indistinguishable from the VCF run in the logs.
        logger.info("Created Orthology file - End time: %s",
                    time.strftime("%H:%M:%S", time.gmtime(end_time)))
        logger.info(
            "Time Elapsed: %s",
            time.strftime("%H:%M:%S", time.gmtime(end_time - start_time)))
Exemple #9
0
def generate_expression_file(generated_files_folder, config_info,
                             taxon_id_fms_subtype_map, upload_flag,
                             validate_flag):
    """Generate the gene expression file.

    The query walks Species <- Gene -> BioEntityGeneExpressionJoin, collects
    the nodes attached to each join as ``terms``, and then gathers the
    ontology nodes reachable from the join's ExpressionBioEntity together
    with the relationship type used to reach them.  The DataSource is
    handed to ``ExpressionFileGenerator`` and ``generate_file`` is called
    on it.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        taxon_id_fms_subtype_map: Passed to the file generator constructor.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    hms = "%H:%M:%S"
    query_text = '''MATCH (speciesObj:Species)<-[:FROM_SPECIES]-(geneObj:Gene)-[:ASSOCIATION]->(begej:BioEntityGeneExpressionJoin)--(term)
                          WITH {primaryKey: speciesObj.primaryKey, name: speciesObj.name} AS species,
                               {primaryKey: geneObj.primaryKey, symbol: geneObj.symbol, dataProvider: geneObj.dataProvider} AS gene,
                               begej,
                               COLLECT(term) AS terms
                          MATCH (begej:BioEntityGeneExpressionJoin)<-[:ASSOCIATION]-(exp:ExpressionBioEntity)-[a:ANATOMICAL_STRUCTURE|CELLULAR_COMPONENT|ANATOMICAL_SUB_SUBSTRUCTURE|CELLULAR_COMPONENT_QUALIFIER|ANATOMICAL_SUB_STRUCTURE_QUALIFIER|ANATOMICAL_STRUCTURE_QUALIFIER]->(ontology:Ontology)
                          RETURN species,
                                 gene,
                                 terms,
                                 begej.primaryKey as begejId,
                                 exp.whereExpressedStatement AS location,
                                 COLLECT({edge: type(a),
                                          primaryKey: ontology.primaryKey,
                                          name: ontology.name}) AS ontologyPaths'''

    if config_info.config["DEBUG"]:
        logger.info("Expression query")
        logger.info(query_text)
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time: %s", start_label)

    neo_uri = get_neo_uri(config_info)
    source = DataSource(neo_uri, query_text)
    generator = expression_file_generator.ExpressionFileGenerator(
        source, generated_files_folder, config_info,
        taxon_id_fms_subtype_map)
    generator.generate_file(upload_flag=upload_flag,
                            validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created Expression file - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #10
0
def generate_disease_file(generated_files_folder, config_info,
                          taxon_id_fms_subtype_map, upload_flag,
                          validate_flag):
    """Generate the disease association file.

    The query selects DiseaseEntityJoin nodes that link a DOTerm to a Gene,
    Allele or AffectedGenomicModel with one of the listed join types, and
    returns one row per association.  Each row carries species, object and
    disease details, collected evidence, experimental conditions,
    annotation sources, ortholog gene keys and the latest assigned date.
    The DataSource is handed to ``DiseaseFileGenerator`` and
    ``generate_file`` is called on it.

    Args:
        generated_files_folder: Passed to the file generator constructor.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        taxon_id_fms_subtype_map: Passed to the file generator constructor.
        upload_flag: Forwarded to ``generate_file`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_file`` as ``validate_flag``.
    """
    hms = "%H:%M:%S"
    query_text = '''MATCH (disease:DOTerm)-[:ASSOCIATION]-(dej:Association:DiseaseEntityJoin)-[:ASSOCIATION]-(object)-[:FROM_SPECIES]-(species:Species)
                   WHERE (object:Gene OR object:Allele OR object:AffectedGenomicModel)
                         AND dej.joinType IN ["IS_MARKER_FOR", // need to remove when removed from database
                                              "IS_IMPLICATED_IN", // need to remove when removed from database
                                              "IS_NOT_IMPLICATED_IN", // need to remove when removed from database
                                              "IS_MODEL_OF",
                                              "is_model_of",
                                              "is_implicated_in",
                                              "is_not_implicated_in",
                                              "is_biomarker_for",
                                              "implicated_via_orthology",
                                              "biomarker_via_orthology"]
                   MATCH (dej:Association:DiseaseEntityJoin)-[:EVIDENCE]->(pj:PublicationJoin),
                         (p:Publication)-[:ASSOCIATION]->(pj:PublicationJoin)-[:ASSOCIATION]->(ec:Ontology:ECOTerm)
                   OPTIONAL MATCH (dej:DiseaseEntityJoin)-[ecType]-(experimental_condition:ExperimentalCondition)
                   WITH disease, dej, object, species, pj, p, ec,
                        COLLECT(DISTINCT {type: type(ecType),
                                          statement: experimental_condition.conditionStatement}) AS experimentalConditions
                   OPTIONAL MATCH (dej:DiseaseEntityJoin)-[:ANNOTATION_SOURCE_CROSS_REFERENCE]->(ascr:CrossReference)
                   WITH disease, dej, object, species, pj, p, ec, experimentalConditions,
                        COLLECT(DISTINCT {curatedDB: ascr.curatedDB, displayName: ascr.displayName}) AS source
                   OPTIONAL MATCH (object:Gene)-[:ASSOCIATION]->(dej:Association:DiseaseEntityJoin)<-[:ASSOCIATION]-(otherAssociatedEntity)
                   OPTIONAL MATCH (pj:PublicationJoin)-[:MODEL_COMPONENT|PRIMARY_GENETIC_ENTITY]-(inferredFromEntity)
                   OPTIONAL MATCH (dej:Association:DiseaseEntityJoin)-[:FROM_ORTHOLOGOUS_GENE]->(oGene:Gene),
                                  (gene:Gene)-[o:ORTHOLOGOUS]->(oGene:Gene)
                   WHERE o.strictFilter AND ec.primaryKey IN ["ECO:0000250", "ECO:0000266", "ECO:0000501"] // ISS, ISO, and IEA respectively
                   //OPTIONAL MATCH (object)-[IS_ALLELE_OF]->(gene:Gene)
                   RETURN DISTINCT
                          dej.primaryKey AS dejID,
                          species.primaryKey AS taxonId,
                          species.name AS speciesName,
                          COLLECT(DISTINCT oGene.primaryKey) AS withOrthologs,
                          labels(object) AS objectType,
                          object.primaryKey AS dbObjectID,
                          object.symbol AS dbObjectSymbol,
                          object.name AS dbObjectName,
                          toLower(dej.joinType) AS associationType,
                          //COLLECT(DISTINCT gene.primaryKey) AS inferredGeneAssociation,
                          disease.doId AS DOID,
                          disease.name AS DOtermName,
                          COLLECT(DISTINCT {pubModID: p.pubModId,
                                            pubMedID: p.pubMedId,
                                            evidenceCode:ec.primaryKey,
                                            evidenceCodeName: ec.name,
                                            inferredFromEntity: inferredFromEntity,
                                            otherAssociatedEntityID: otherAssociatedEntity.primaryKey}) AS evidence,
                          experimentalConditions,
                          REDUCE(t = "1900-01-01", c IN COLLECT(left(pj.dateAssigned, 10)) | CASE WHEN c > t THEN c ELSE t END) AS dateAssigned,
                          ///takes most recent date
                          source,
                          dej.dataProvider AS dataProvider'''

    if config_info.config["DEBUG"]:
        logger.info("Disease Association Query: ")
        logger.info(query_text)
        start_time = time.time()
        start_label = time.strftime(hms, time.gmtime(start_time))
        logger.info("Start time: %s", start_label)

    neo_uri = get_neo_uri(config_info)
    source = DataSource(neo_uri, query_text)
    generator = disease_file_generator.DiseaseFileGenerator(
        source, generated_files_folder, config_info,
        taxon_id_fms_subtype_map)
    generator.generate_file(upload_flag=upload_flag,
                            validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        finished = time.time()
        logger.info("Created Disease Association file - End time: %s",
                    time.strftime(hms, time.gmtime(finished)))
        logger.info("Time Elapsed: %s",
                    time.strftime(hms, time.gmtime(finished - start_time)))
Exemple #11
0
def generate_vcf_file(assembly, generated_files_folder, skip_chromosomes,
                      config_info, upload_flag, validate_flag):
    """Generate the VCF files for a single assembly.

    The query selects variants located on the given assembly whose
    reference and variant sequences differ, or whose variant sequence is
    the empty string.  For each it returns position, sequences, species,
    SO term, the alleles carrying it, and its gene-level and
    transcript-level consequences.  The DataSource is handed to
    ``VcfFileGenerator`` and ``generate_files`` is called on it.

    Args:
        assembly: Assembly primary key; it is spliced into the query text
            by string concatenation, so it must be a ``str``.
        generated_files_folder: Passed to the file generator constructor.
        skip_chromosomes: Forwarded to ``generate_files``.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_files`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_files`` as ``validate_flag``.
    """
    # Lazy %-style argument, matching every other log call in this module,
    # so the message is only formatted when the record is actually emitted.
    logger.info("Querying Assembly: %s", assembly)

    variants_query = '''MATCH (s:Species)-[:FROM_SPECIES]-(a:Allele)-[:VARIATION]-(v:Variant)-[l:LOCATED_ON]->(c:Chromosome),
                              (v:Variant)-[:VARIATION_TYPE]->(st:SOTerm),
                              (v:Variant)-[:ASSOCIATION]->(p:GenomicLocation)-[:ASSOCIATION]->(assembly:Assembly {primaryKey: "''' + assembly + '''"})
                     WHERE NOT v.genomicReferenceSequence = v.genomicVariantSequence
                           OR v.genomicVariantSequence = ""
                     OPTIONAL MATCH (a:Allele)-[:IS_ALLELE_OF]-(g:Gene)
                     WITH COLLECT(DISTINCT {symbol: a.symbol,
                                            symbolText: a.symbolText,
                                            id: a.primaryKey}) AS alleles,
                          s, v, l, c, st, p, assembly
                     OPTIONAL MATCH (v:Variant)-[:ASSOCIATION]-(glc:GeneLevelConsequence)-[:ASSOCIATION]-(g:Gene)
                     WITH alleles, s, v, l, c, st, p, assembly,
                          COLLECT(DISTINCT {gene: g.primaryKey,
                                            geneSymbol: g.symbol,
                                            consequence: glc.geneLevelConsequence,
                                            impact: glc.impact}) AS geneConsequences
                     OPTIONAL MATCH (v:Variant)-[:ASSOCIATION]-(tlc:TranscriptLevelConsequence)-[:ASSOCIATION]-(t:Transcript)
                     RETURN c.primaryKey AS chromosome,
                            v.globalId AS globalId,
                            right(v.paddingLeft,1) AS paddingLeft,
                            v.genomicReferenceSequence AS genomicReferenceSequence,
                            v.genomicVariantSequence AS genomicVariantSequence,
                            v.hgvsNomenclature AS hgvsNomenclature,
                            v.dataProvider AS dataProvider,
                            assembly.primaryKey AS assembly,
                            alleles,
                            geneConsequences,
                            COLLECT(DISTINCT {transcript: t.primaryKey,
                                              transcriptGFF3ID: t.gff3ID,
                                              transcriptGFF3Name: t.name,
                                              consequence: tlc.molecularConsequences,
                                              impact: tlc.impact}) AS transcriptConsequences,
                            p.start AS start,
                            p.end AS end,
                            s.name AS species,
                            st.nameKey AS soTerm
                     '''

    if config_info.config["DEBUG"]:
        logger.info(variants_query)
        start_time = time.time()
        logger.info("Start time: %s",
                    time.strftime("%H:%M:%S", time.gmtime(start_time)))

    data_source = DataSource(get_neo_uri(config_info), variants_query)
    gvf = vcf_file_generator.VcfFileGenerator(data_source,
                                              generated_files_folder,
                                              config_info)
    gvf.generate_files(skip_chromosomes=skip_chromosomes,
                       upload_flag=upload_flag,
                       validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        end_time = time.time()
        logger.info("Created VCF file - End time: %s",
                    time.strftime("%H:%M:%S", time.gmtime(end_time)))
        logger.info(
            "Time Elapsed: %s",
            time.strftime("%H:%M:%S", time.gmtime(end_time - start_time)))
Exemple #12
0
def generate_variant_allele_species_file(species_id, generated_files_folder,
                                         skip_chromosomes, config_info,
                                         upload_flag, validate_flag):
    """Generate the variant-allele files for one species (or all combined).

    The query starts from the alleles of the requested species and
    optionally joins their variants, then aggregates per allele/variant:
    affected genes and gene-level consequences, allele-associated genes,
    synonyms, publications, cross-references, and phenotype, disease and
    variant counts.  The DataSource is handed to
    ``VariantAlleleFileGenerator`` and ``generate_files`` is called on it.

    The gene-level-consequence pattern uses a single-headed arrow
    (Gene -> GeneLevelConsequence).  The former two-headed
    ``<-[:ASSOCIATION]->`` did not express a direction.
    NOTE(review): direction assumed to be Gene -> GeneLevelConsequence;
    confirm against the graph schema.

    Args:
        species_id: Species primary key used to restrict the query, or the
            literal "COMBINED" to query every species.  It is spliced into
            the query text by string concatenation, so it must be a ``str``.
        generated_files_folder: Passed to the file generator constructor.
        skip_chromosomes: Forwarded to ``generate_files``.
        config_info: Object exposing a ``config`` mapping with a "DEBUG"
            entry; also passed to ``get_neo_uri`` and the file generator.
        upload_flag: Forwarded to ``generate_files`` as ``upload_flag``.
        validate_flag: Forwarded to ``generate_files`` as ``validate_flag``.
    """
    # Lazy %-style argument, matching every other log call in this module,
    # so the message is only formatted when the record is actually emitted.
    logger.info("Querying Species: %s", species_id)

    # "COMBINED" means no species filter: the Species node pattern is left
    # without a property map.
    species_id_restriction = ""
    if species_id != "COMBINED":
        species_id_restriction = " {primaryKey: \"" + species_id + "\"}"
    variant_allele_query = '''MATCH (s:Species''' + species_id_restriction + ''')<-[:FROM_SPECIES]-(a:Allele)
                              OPTIONAL MATCH (a:Allele)<-[:VARIATION]-(v:Variant)-[:LOCATED_ON]->(c:Chromosome),
                                             (v:Variant)-[:VARIATION_TYPE]->(st:SOTerm),
                                             (v:Variant)-[:ASSOCIATION]->(p:GenomicLocation)-[:ASSOCIATION]->(assembly:Assembly)
                              WHERE NOT v.genomicReferenceSequence = v.genomicVariantSequence
                                    OR v.genomicVariantSequence = ""
                              WITH v, a, s, c, p, st, assembly
                              OPTIONAL MATCH (v:Variant)<-[:COMPUTED_GENE]-(gene:Gene)
                              WITH v, a, s, c, p, st, assembly, gene
                              OPTIONAL MATCH (v:Variant)-[:ASSOCIATION]->(glc:GeneLevelConsequence)<-[:ASSOCIATION]-(gene:Gene)-[:COMPUTED_GENE]->(v:Variant)
                              WITH v, a, s, c, p, st, assembly,
                                   COLLECT(glc.geneLevelConsequence) AS geneConsequences,
                                   COLLECT(DISTINCT {id: gene.primaryKey,
                                            symbol: gene.symbol}) AS variantAffectedGenes
                              OPTIONAL MATCH (a:Allele)-[:IS_ALLELE_OF]->(gene:Gene)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, variantAffectedGenes,
                                   COLLECT(DISTINCT {id: gene.primaryKey,
                                            symbol: gene.symbol}) AS alleleAssociatedGenes
                              OPTIONAL MATCH (a:Allele)-[:ALSO_KNOWN_AS]-(syn:Synonym)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, variantAffectedGenes, alleleAssociatedGenes,
                                   COLLECT(DISTINCT syn.primaryKey) AS alleleSyns
                              OPTIONAL MATCH (v:Variant)-[:ALSO_KNOWN_AS]-(syn:Synonym)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, variantAffectedGenes, alleleAssociatedGenes, alleleSyns,
                                   COLLECT(DISTINCT syn.primaryKey) AS variantSyns
                              WITH v, a, s, c, p, st, assembly, geneConsequences, variantAffectedGenes, alleleAssociatedGenes, alleleSyns, variantSyns
                              OPTIONAL MATCH (v:Variant)-[:ASSOCIATION]->(pub:Publication)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, alleleAssociatedGenes, alleleSyns, variantSyns, variantAffectedGenes,
                                   COLLECT(pub.primaryKey) AS pubIds
                              OPTIONAL MATCH (v:Variant)-[:CROSS_REFERENCE]->(vcr:CrossReference)<-[:CROSS_REFERENCE]-(a:Allele)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, alleleAssociatedGenes, alleleSyns, variantSyns, variantAffectedGenes, pubIds,
                                   COLLECT(vcr.name) AS variantCrossReferences
                              OPTIONAL MATCH (v:Variant)-[:HAS_PHENOTYPE]-(phenotype:Phenotype)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, alleleAssociatedGenes, alleleSyns, variantSyns, variantAffectedGenes, pubIds,
                                   variantCrossReferences,
                                   COUNT(phenotype) AS variantPhenotypeCount
                              OPTIONAL MATCH (a:Allele)-[:HAS_PHENOTYPE]-(phenotype:Phenotype)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, alleleAssociatedGenes, alleleSyns, variantSyns, variantAffectedGenes, pubIds,
                                   variantCrossReferences,
                                   variantPhenotypeCount,
                                   COUNT(phenotype) AS allelePhenotypeCount
                              OPTIONAL MATCH (v:Variant)-[:ASSOCIATION]->(disease:DiseaseEntityJoin)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, alleleAssociatedGenes, alleleSyns, variantSyns, variantAffectedGenes, pubIds,
                                   variantCrossReferences,
                                   variantPhenotypeCount,
                                   allelePhenotypeCount,
                                   COUNT(disease) AS variantDiseaseCount
                              OPTIONAL MATCH (a:Allele)-[:ASSOCIATION]->(disease:DiseaseEntityJoin)
                              WITH v, a, s, c, p, st, assembly, geneConsequences, alleleAssociatedGenes, alleleSyns, variantSyns, variantAffectedGenes, pubIds,
                                   variantCrossReferences,
                                   variantPhenotypeCount,
                                   allelePhenotypeCount,
                                   variantDiseaseCount,
                                   count(disease) AS alleleDiseaseCount
                              OPTIONAL MATCH (a:Allele)<-[:VARIATION]-(variant:Variant)
                              RETURN c.primaryKey AS chromosome,
                                     variantCrossReferences,
                                     variantPhenotypeCount,
                                     allelePhenotypeCount,
                                     variantDiseaseCount,
                                     alleleDiseaseCount,
                                     COUNT(DISTINCT variant) AS alleleVariantCount,
                                     v.globalId AS globalId,
                                     right(v.paddingLeft,1) AS paddingLeft,
                                     v.genomicReferenceSequence AS genomicReferenceSequence,
                                     v.genomicVariantSequence AS genomicVariantSequence,
                                     v.hgvsNomenclature AS hgvsNomenclature,
                                     v.dataProvider AS dataProvider,
                                     assembly.primaryKey AS assembly,
                                     alleleSyns,
                                     variantSyns,
                                     v.primaryKey as variantId,
                                     {id: st.primaryKey,
                                      name: st.name} AS variationType,
                                     {symbol: a.symbol,
                                      symbolText: a.symbolText,
                                      id: a.primaryKey} AS allele,
                                     pubIds,
                                     variantAffectedGenes,
                                     geneConsequences,
                                     alleleAssociatedGenes,
                                     p.start AS start,
                                     p.end AS end,
                                     s.name AS species,
                                     s.primaryKey AS taxonId,
                                     st.nameKey AS soTerm'''
    if config_info.config["DEBUG"]:
        logger.info(variant_allele_query)
        start_time = time.time()
        logger.info("Start time: %s",
                    time.strftime("%H:%M:%S", time.gmtime(start_time)))

    variant_allele_data_source = DataSource(get_neo_uri(config_info),
                                            variant_allele_query)
    gvaf = variant_allele_file_generator.VariantAlleleFileGenerator(
        variant_allele_data_source, generated_files_folder, config_info)
    gvaf.generate_files(species_id,
                        skip_chromosomes=skip_chromosomes,
                        upload_flag=upload_flag,
                        validate_flag=validate_flag)

    if config_info.config["DEBUG"]:
        end_time = time.time()
        logger.info("Created Variant Allele file %s - End time: %s",
                    species_id, time.strftime("%H:%M:%S",
                                              time.gmtime(end_time)))
        logger.info(
            "Time Elapsed: %s",
            time.strftime("%H:%M:%S", time.gmtime(end_time - start_time)))