Example #1
0
def dump(files: iter, output_dir: str, unique: bool, update_all: bool):
    """
    Parse MEDLINE XML files into tabular flat-files for each DB table.

    In addition, a ``delete.txt`` file is generated, containing the PMIDs
    that should first be deleted from the DB before copying the dump.

    Output files that are still empty once all input has been processed are
    removed again. Every input and output stream is closed, even if parsing
    fails part-way through.

    :param files: a list of XML files to parse (optionally, gzipped)
    :param output_dir: path to the output directory for the dump
    :param unique: if ``True`` only VersionId == "1" records are dumped
    :param update_all: if ``True`` the PMIDs of all parsed records are
                   added to the list of PMIDs for deletion
    """
    file_names = {
        Citation.__tablename__: "citations.tab",
        Abstract.__tablename__: "abstracts.tab",
        Section.__tablename__: "sections.tab",
        Descriptor.__tablename__: "descriptors.tab",
        Qualifier.__tablename__: "qualifiers.tab",
        Author.__tablename__: "authors.tab",
        Identifier.__tablename__: "identifiers.tab",
        Database.__tablename__: "databases.tab",
        PublicationType.__tablename__: "publication_types.tab",
        Chemical.__tablename__: "chemicals.tab",
        Keyword.__tablename__: "keywords.tab",
        'delete': "delete.txt",
    }
    out_stream = {}
    count = 0

    try:
        for table, file_name in file_names.items():
            out_stream[table] = open(join(output_dir, file_name), "wt")

        parser = MedlineXMLParser(unique)

        for f in files:
            logger.info('dumping %s', f)

            if f.lower().endswith('.gz'):
                in_stream = gunzip(f, 'rb')
            else:
                in_stream = open(f)

            try:
                count += _dump(in_stream, out_stream, parser, update_all)
            finally:
                # do not leak one file handle per parsed input file
                in_stream.close()
    finally:
        for stream in out_stream.values():
            is_empty = stream.tell() == 0
            stream.close()

            if is_empty:
                # stream.name already is the path the file was opened with
                # (it includes output_dir); joining output_dir onto it again
                # would point to a non-existent file for relative directories
                remove(stream.name)

    logger.info("parsed %i records", count)
Example #2
0
File: crud.py Project: henn/medic
def dump(files: iter, output_dir: str, unique: bool, update_all: bool):
    """
    Parse MEDLINE XML files into tabular flat-files for each DB table.

    In addition, a ``delete.txt`` file is generated, containing the PMIDs
    that should first be deleted from the DB before copying the dump.

    Output files that are still empty once all input has been processed are
    removed again. Every input and output stream is closed, even if parsing
    fails part-way through.

    :param files: a list of XML files to parse (optionally, gzipped)
    :param output_dir: path to the output directory for the dump
    :param unique: if ``True`` only VersionId == "1" records are dumped
    :param update_all: if ``True`` the PMIDs of all parsed records are
                   added to the list of PMIDs for deletion
    """
    file_names = {
        Citation.__tablename__: "citations.tab",
        Abstract.__tablename__: "abstracts.tab",
        Section.__tablename__: "sections.tab",
        Descriptor.__tablename__: "descriptors.tab",
        Qualifier.__tablename__: "qualifiers.tab",
        Author.__tablename__: "authors.tab",
        Identifier.__tablename__: "identifiers.tab",
        Database.__tablename__: "databases.tab",
        PublicationType.__tablename__: "publication_types.tab",
        Chemical.__tablename__: "chemicals.tab",
        Keyword.__tablename__: "keywords.tab",
        'delete': "delete.txt",
    }
    out_stream = {}
    count = 0

    try:
        for table, file_name in file_names.items():
            out_stream[table] = open(join(output_dir, file_name), "wt")

        parser = MedlineXMLParser(unique)

        for f in files:
            logger.info('dumping %s', f)

            if f.lower().endswith('.gz'):
                in_stream = gunzip(f, 'rb')
            else:
                in_stream = open(f)

            try:
                count += _dump(in_stream, out_stream, parser, update_all)
            finally:
                # do not leak one file handle per parsed input file
                in_stream.close()
    finally:
        for stream in out_stream.values():
            is_empty = stream.tell() == 0
            stream.close()

            if is_empty:
                # stream.name already is the path the file was opened with
                # (it includes output_dir); joining output_dir onto it again
                # would point to a non-existent file for relative directories
                remove(stream.name)

    logger.info("parsed %i records", count)
Example #3
0
def _openFile(name):
    """
    Open the file at ``name`` for reading.

    A name ending in ``.gz`` (compared case-insensitively) is opened through
    ``gunzip`` in ``'rb'`` mode; any other name is opened with the built-in
    ``open`` in its default mode.
    """
    is_gzipped = name.lower().endswith('.gz')

    if not is_gzipped:
        return open(name)

    # use wrapper to support pre-3.3
    return gunzip(name, 'rb')
Example #4
0
def run_module():
    """
    Entry point of the Ansible module managing a single Kodi addon.

    Depending on ``state`` the addon named ``name`` is removed (``absent``),
    or installed and enabled (``present``/``enabled``) or installed and
    disabled (``disabled``). The function always terminates through
    ``module.exit_json`` or ``module.fail_json``.

    In check mode nothing is downloaded, installed or removed; only the
    ``changed`` flag that a real run would produce is reported.
    """
    # seed the result dict in the object; nothing has been changed yet, so a
    # failure reported before any work is done must not claim a change
    result = dict(
        changed=False,
    )

    # define available arguments/parameters a user can pass to the module
    module = AnsibleModule(
        argument_spec=dict(
            state=dict(
                type='str',
                default='enabled',
                required=False,
                choices=[
                    'present',
                    'absent',
                    'enabled',
                    'disabled'
                ]
            ),
            name=dict(
                type='str',
                required=True
            ),
            kodi_user=dict(
                type='str',
                required=False,
                default='kodi'
            ),
            kodi_home=dict(
                type='str',
                required=False,
                default=''
            ),
            kodi_release=dict(
                type='str',
                required=True
            ),
        ),
        supports_check_mode=True
    )

    name = module.params['name']
    state = module.params['state']

    # We only work with releases that have the database version 27
    # See: https://kodi.wiki/view/Databases#Database_Versions
    if module.params['kodi_release'] not in SUPPORTED_RELEASES:
        module.fail_json(
            # one-element tuple: formatting stays correct even if
            # SUPPORTED_RELEASES itself is a tuple
            msg='Unsupported kodi release. Supported: %s' % (SUPPORTED_RELEASES,),
            **result
        )

    # Seed the default kodi home if not supplied
    if module.params['kodi_home']:
        kodi_home = module.params['kodi_home']
    else:
        kodi_home = "%s/.kodi" % (getpwnam(module.params['kodi_user']).pw_dir)

    # Seed the default kodi repository
    kodi_repo = 'http://mirrors.kodi.tv/addons/%s/addons.xml.gz' % (module.params['kodi_release'])

    # Shortcut for the actual repository, computed removing 'addons.xml(.gz)'
    repo_base = '/'.join(kodi_repo.split('/')[0:-1])

    addon_dir = "%s/addons/%s" % (kodi_home, name)

    if state == 'absent':

        if module.check_mode:
            # We would only remove the addon if it's in the directory,
            #   or if it has traces in the db
            result['changed'] = (
                exists(addon_dir)
                or is_in_db(name, kodi_home)
            )
            module.exit_json(**result)

        result['changed'] = remove_addon(kodi_home, name)
        module.exit_json(**result)

    # 'absent' was handled above; of the remaining choices only 'disabled'
    # asks for a disabled addon ('present' and 'enabled' both enable it)
    enabled = state != 'disabled'

    # If the directory already exists and the enabled status is as desired, then we have nothing to do
    if exists(addon_dir):
        if is_enabled(name, kodi_home) == enabled:
            result['changed'] = False
            module.exit_json(**result)

    # From here on the addon would be (re)installed: in check mode only
    # report the pending change instead of downloading and installing
    if module.check_mode:
        result['changed'] = True
        module.exit_json(**result)

    addons_xml = mktemp()

    # Download the repository definition, uncompressing it if necessary
    # This check is left from an attempt to support multiple repositories which might be retried, no need to delete it.
    if kodi_repo.split('.')[-1] == 'gz':
        addons_xml_gz = mktemp()
        download(kodi_repo, addons_xml_gz.name)
        addons_xml.seek(0)
        addons_xml.truncate()
        compressed = gunzip(addons_xml_gz.name)
        try:
            addons_xml.write(compressed.read())
        finally:
            # release the handle on the downloaded archive
            compressed.close()
        addons_xml.flush()
    else:
        download(kodi_repo, addons_xml.name)

    # Load the actual xml
    addons = etree.parse(addons_xml.name)

    install_addon(repo_base, addons, name, module.params['kodi_user'], kodi_home, module.params['kodi_release'], enabled)

    result['changed'] = True
    module.exit_json(**result)
Example #5
0
def _openFile(name):
    """
    Return a readable stream for ``name``.

    Names with a ``.gz`` suffix (any letter case) go through ``gunzip`` in
    ``'rb'`` mode, everything else through the built-in ``open``.
    """
    # use wrapper to support pre-3.3
    return gunzip(name, 'rb') if name.lower().endswith('.gz') else open(name)
Example #6
0
#!/usr/bin/env python3

from collections import Counter
import csv
from gzip import open as gunzip
import json
import string
import re

# Regex character class matching one punctuation or whitespace character.
# The characters must be escaped: unescaped, the ``\]`` inside
# string.punctuation is read as an escaped ``]`` (so the backslash itself is
# not treated as a separator) and the bare ``[`` triggers a "possible nested
# set" FutureWarning.
separators = '[{}]'.format(re.escape(string.punctuation + string.whitespace))
queries = set()

# Collect the distinct values of the second column of the ten gzipped,
# tab-separated files aol-01 ... aol-10.
for i in range(10):
    filename = 'data/aol/aol-{}.txt.gz'.format(str(i + 1).zfill(2))
    with gunzip(filename, 'rt') as file:
        reader = csv.reader(file, delimiter='\t')
        _header = next(reader)  # skip the first row
        queries.update(line[1] for line in reader)

# Count in how many distinct queries' splits each token occurs (a token
# repeated inside one query is counted once per occurrence).
data = Counter(item for query in queries
               for item in re.split(separators, query))

data = dict(data)

# filter out stopwords
with open('data/stopwords.txt') as f:
    for word in f:
        # drop the stopword if present; absent words are ignored
        data.pop(word.strip(), None)
                    type=float,
                    metavar='min. proportion alt alleles',
                    required=True,
                    help='minimum proportion of alternative alleles to allow')
# Optional '-ly' flag, passed as the strings 'true'/'false' (default 'false').
parser.add_argument(
    '-ly',
    type=str,
    metavar='lyrata_only?',
    required=False,
    default='false',
    help='do you want to include lyrata only (true) or not (false)?')

args = parser.parse_args()

# NOTE(review): the standard-library gzip module has no `gunzip` attribute;
# if `gzip` here is the stdlib module this call raises AttributeError --
# confirm what `gzip` refers to in this file.
# NOTE(review): the file opened inside this branch is rebound by the
# unconditional open() below without being closed, so the "minAlleles_"
# file is created but never written through this name -- confirm intent.
if args.gz == 'true' and args.v[-3:] == '.gz':
    gzip.gunzip(args.v)
    lookup_table_file = open(
        args.v + args.o + "repolarized.lookupKey.minAlleles_" + str(args.mi) +
        ".txt", 'w')

# The output name is built by plain string concatenation with args.o as
# the prefix.
lookup_table_file = open(
    args.o + "repolarized.lookupKey.minInd_" + str(args.mi) + ".txt", 'w')

# NOTE(review): args.mi is overridden here, after it was already used in the
# lookup-table file name above -- the name keeps the value from the command
# line; confirm this is intended.
if args.ly == 'true':
    args.mi = 2  # args.mi must = 2, since there are only two lyrata samples
    args.mp = 1.0

# Counters and the counts output file; their use is outside the visible part
# of the script.
count = 0
type_counts = [0, 0, 0, 0]
count_file = open(args.o + "counts.txt", 'w')