예제 #1
0
    def test_load_repertoire(self):
        """Check load_repertoire() with validation on a good and a bad file.

        The good file must load without raising; loading the bad file must
        raise.
        """
        # Good data: any exception from the loader is a test failure.
        try:
            airr.load_repertoire(self.rep_good, validate=True)
        except Exception:
            self.fail('load_repertoire(): good data failed')

        # Bad data. The path is read outside the ``try`` so that a missing
        # fixture attribute is reported as an error instead of being mistaken
        # for the expected loader failure.
        # NOTE(review): assumes the fixture for the invalid file is
        # ``self.rep_bad`` -- confirm against setUp().
        bad_file = self.rep_bad
        try:
            airr.load_repertoire(bad_file, validate=True)
        except Exception:
            # Expected: the loader rejected the bad file.
            # NOTE(review): narrow this to the loader's validation error once
            # the exact exception type is confirmed.
            pass
        else:
            # Reported from ``else`` so the AssertionError raised here cannot
            # be swallowed by the handler above.
            self.fail('load_repertoire(): bad data failed')
예제 #2
0
    def test_load_repertoire(self):
        """Check load_repertoire() with validation on a good and a bad file.

        The good file must load without raising; loading the bad file must
        raise ``ValidationError``. Any other exception from the bad file is
        reported and propagated.
        """
        # Good data: any exception from the loader is a test failure.
        try:
            airr.load_repertoire(self.rep_good, validate=True)
        except Exception:
            self.fail('load_repertoire(): good data failed')

        # Bad data
        try:
            airr.load_repertoire(self.rep_bad,
                                 validate=True,
                                 debug=True)
        except ValidationError:
            # Expected outcome for the invalid file.
            pass
        except Exception as inst:
            # Unexpected error type: show it, then re-raise with the
            # original traceback intact.
            print(type(inst))
            raise
        else:
            # Reported from ``else`` so the failure is not routed through
            # the generic handler above.
            self.fail('load_repertoire(): bad data failed')
    print("ELAPSED DOWNLOAD TIME (in hours): %s" % (total_time / 3600))

    filename = str(query_files.split("/")[-1].split(".")[0]) + "_" + str(
        study_id) + "__OUT.json"
    json_data = parse_query(
        query_json,
        str(details_dir) + str(query_files.split("/")[-1].split(".")[0]) +
        "_" + str(study_id) + "_")

    #     # Uncomment when AIRR test is ready to be used again
    if entry_pt == "repertoire":

        print("In repertoire entry point", entry_pt)

        try:
            airr.load_repertoire(str(details_dir) + filename, validate=True)
            print("Successful repertoire loading - AIRR test passed\n")
        except airr.ValidationError as err:
            print("ERROR: AIRR repertoire validation failed for file %s - %s" %
                  (filename, err))
            print("\n")
        print(
            "---------------------------------------------------------------------------------------------------------------------------------------------------"
        )

    #Begin sanity checking
    print(
        "########################################################################################################"
    )
    print(
        "---------------------------------------VERIFY FILES ARE HEALTHY-----------------------------------------\n"
예제 #4
0
import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np

# One 50-bin histogram (junction lengths 0..49) for each of the
# 4 T cell subsets, keyed by cell subset id.
subsets = {
    'CL_0000895': [0] * 50,
    'CL_0000900': [0] * 50,
    'CL_0000897': [0] * 50,
    'CL_0000909': [0] * 50,
}

# Read the repertoire metadata and index the repertoires by repertoire_id
data = airr.load_repertoire('repertoires.airr.json')
repertoires = dict((obj['repertoire_id'], obj) for obj in data['Repertoire'])

# Walk the rearrangement data and tally the junction lengths per subset
reader = airr.read_rearrangement('rearrangements.tsv')
for row in reader:
    # repertoire this rearrangement belongs to
    rep = repertoires[row['repertoire_id']]
    # histogram selected by the cell_subset id of the first sample
    c = subsets[rep['sample'][0]['cell_subset']['id']]
    # rows with an empty (or zero) junction length are not counted
    if not row['junction_aa_length']:
        continue
    length = int(row['junction_aa_length'])
    # lengths of 50 and above fall outside the histogram
    if length >= 50:
        continue
    c[length] += 1
        cfg['password'] = os.getenv('MONGODB_SECRET')
        return cfg
    else:
        print('ERROR: loading config')
        return None


# connection header
config = getConfig()
if config is None:
    # Without a configuration there are no credentials or database name to
    # put in the script; stop with a clear message instead of failing on
    # the first subscript below.
    raise SystemExit('ERROR: no configuration available, '
                     'cannot create the repertoire script')

# json.dumps() emits each value as a double-quoted, escaped string literal,
# so quotes or backslashes in a value cannot break the generated JavaScript.
header = 'var conn = new Mongo();\n'
header += 'var db = conn.getDB("admin");\n'
header += ('db.auth(' + json.dumps(config['service_user']) + ', ' +
           json.dumps(config['service_secret']) + ');\n')
header += 'db = db.getSiblingDB(' + json.dumps(config['db']) + ');\n'

# Create the output directory without spawning a shell; an already existing
# directory is not an error.
os.makedirs('/work_data/tmp', exist_ok=True)
fname = '/work_data/tmp/repertoire.js'
print('Creating file: ' + fname)

# Load the metadata before opening the output so that a load failure does
# not leave a header-only script behind.
data = airr.load_repertoire('/work/florian.airr.yaml')
reps = data['Repertoire']

# The context manager closes the file even if a write fails part-way.
with open(fname, 'w') as fout:
    fout.write(header)
    for r in reps:
        # Replace any existing document with the same repertoire_id.
        fout.write('db.repertoire.deleteOne({"repertoire_id":' +
                   json.dumps(r['repertoire_id']) + '});\n')
        fout.write('db.repertoire.insertOne(' + json.dumps(r) + ');\n')
예제 #6
0
def airrdownload(args):
    """Download the rearrangements of every repertoire in a metadata file.

    Loads the repertoire metadata named by ``args.repertoire``, then, for
    each repertoire, pages through the ``/rearrangement`` endpoint of the
    host returned by ``testserver()`` (productive rearrangements only) and
    appends every returned row to one TSV file.

    Args:
        args: object with a ``repertoire`` attribute holding the path of
            the repertoire metadata file.

    Raises:
        SystemExit: with status 1 if loading the repertoire file raises
            ``TypeError``.
    """
    # Rows requested per call; a page shorter than this is the last one.
    page_size = 1000

    airr.validate_repertoire(args.repertoire, True)
    repertoire_file = args.repertoire
    # The output name replaces the last four characters of the input name
    # (presumably a 4-letter extension such as "json" or "yaml").
    rearrangements_file = repertoire_file[:-4] + "rearrangements.tsv"
    try:
        data = airr.load_repertoire(args.repertoire)
    except TypeError:
        sys.stderr.write("TCRcloud error: It seems you did not indicate a "
                         "properly formatted AIRR repertoire file\n")
        sys.exit(1)
    repertoires = data["Repertoire"]
    host_url = testserver(data)

    # Print out some Info
    print("       Info: " + data["Info"]["title"])
    print("    version: " + str(data["Info"]["version"]))
    print("description: " + data["Info"]["description"])
    print("Found " + str(len(repertoires)) + " repertoires in "
          "repertoire metadata file.")

    # Query the rearrangement endpoint
    # Define a generic query object, and we will replace the repertoire_id
    # within the loop. We also only request productive rearrangements as
    # an additional filter.
    query = {
        "filters": {
            "op": "and",
            "content": [
                {
                    "op": "=",
                    "content": {
                        "field": "repertoire_id",
                        "value": "XXX"
                    }
                },
                {
                    "op": "=",
                    "content": {
                        "field": "productive",
                        "value": True
                    }
                }
            ]
        },
        "size": page_size,
        "from": 0
    }

    # Loop through each repertoire and query rearrangement data for
    # each. We download in chunks of ``page_size`` rows using the from
    # and size parameters.
    out_file = None
    try:
        for r in repertoires:
            print("Retrieving rearrangements for repertoire: "
                  + r["repertoire_id"])
            query["filters"]["content"][0]["content"]["value"] = \
                r["repertoire_id"]
            query["size"] = page_size
            query["from"] = 0

            cnt = 0
            while True:
                # send the request
                resp = requests.post(host_url + "/rearrangement", json=query)
                rearrangements = resp.json()["Rearrangement"]

                # Open the output lazily, on the first non-empty page: the
                # full set of returned fields is needed, otherwise by
                # default only the required fields are written. An empty
                # page has no row to take the field names from.
                if out_file is None and rearrangements:
                    out_file = airr.create_rearrangement(
                        rearrangements_file,
                        fields=rearrangements[0].keys())

                # save the rearrangements to a file
                for row in rearrangements:
                    out_file.write(row)

                # A short page means the server has no more rows.
                cnt += len(rearrangements)
                if len(rearrangements) < page_size:
                    break

                # Need to update the from parameter to get the next chunk
                query["from"] = cnt

            print("Retrieved " + str(cnt) + " rearrangements for repertoire: "
                  + r["repertoire_id"])
    finally:
        # Close the writer so buffered rows reach the disk, even when a
        # request or write fails part-way through.
        if out_file is not None:
            out_file.close()

    if out_file is None:
        print("No rearrangements retrieved, nothing saved")
    else:
        print("Saved as " + rearrangements_file)
예제 #7
0
    def process(self, filename):
        """Load an AIRR repertoire file and insert its repertoires.

        Each entry of the file's 'Repertoire' list is flattened with
        ``ir_flatten`` and, once all entries have been checked, inserted
        with ``repositoryInsertRepertoire``.

        Args:
            filename: path of the repertoire file to load.

        Returns:
            bool: True when every repertoire was inserted. False when the
            path is not a file, loading/validation raises, flattening
            raises TypeError, a repertoire lacks the rearrangement file
            field, or an insert returns None.
        """
        # Check to see if we have a file
        if not os.path.isfile(filename):
            print("ERROR: input file " + filename + " is not a file")
            return False

        # Get the column tag for the iReceptor mapping
        ireceptor_tag = self.getiReceptorTag()

        # Get the column tag for the repository mapping
        repository_tag = self.getRepositoryTag()

        # Check the validity of the repertoires from an AIRR perspective.
        # Validation errors and any other load failure are reported the
        # same way, so a single handler covers both.
        try:
            data = airr.load_repertoire(filename, validate=True)
        except Exception as err:
            print("ERROR: AIRR repertoire validation failed for file %s - %s" %
                  (filename, err))
            return False

        # Get the fields to use for finding repertoire IDs, either using those IDs
        # directly or by looking for a repertoire ID based on a rearrangement file
        # name.
        # NOTE(review): repertoire_id_field is fetched but not used below.
        repertoire_id_field = self.getRepertoireLinkIDField()
        rearrangement_file_field = self.getRearrangementFileField()

        # The 'Repertoire' contains a dictionary for each repertoire.
        repertoire_list = []
        for repertoire in data['Repertoire']:
            repertoire_dict = dict()
            for key, value in repertoire.items():
                try:
                    self.ir_flatten(key, value, repertoire_dict)
                except TypeError as error:
                    print("ERROR: %s" % (error))
                    return False

            # Ensure that we have a correct file name to link fields. If we can't find it
            # this is a fatal error as we can not link any data to this repertoire,
            # so there is no point adding the repertoire...
            repository_file_field = self.getAIRRMap().getMapping(
                rearrangement_file_field, ireceptor_tag, repository_tag)
            # If we can't find a mapping for this field in the repository mapping, then
            # we might still be OK if the metadata itself has the field, so fall back
            # to the unmapped field name.
            if not repository_file_field:
                print(
                    "Warning: No repository mapping for the rearrangement file field (%s)"
                    % (rearrangement_file_field))
                repository_file_field = rearrangement_file_field

            # If we can't find the file field for the rearrangement field in the repository, then
            # abort, as we won't be able to link the repertoire to the rearrangement.
            if repository_file_field not in repertoire_dict:
                # The conversion must be "%s": a bare "%" followed by ")"
                # raises ValueError instead of printing the message.
                print(
                    "ERROR: Could not find a rearrangement file field in the metadata (%s)"
                    % (rearrangement_file_field))
                print(
                    "ERROR: Will not be able to link repertoire to rearrangement annotations"
                )
                return False

            repertoire_list.append(repertoire_dict)

        # Iterate over the list and load records. Note that this code inserts all data
        # that was read in. That is, all of the non MiAIRR fields that exist
        # are stored in the repository. So if the provided file has lots of extra fields
        # they will exist in the repository.
        # TODO: Ensure that all records are written as the correct type for the repository.
        for r in repertoire_list:
            if self.repositoryInsertRepertoire(r) is None:
                return False

        # If we made it here we are DONE!
        return True