"""
    Time Analysis of CDEs
    @author: Riccardo Miotto
"""

import argparse
from matplotlib.backends.backend_pdf import PdfPages
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
from ctgov.utility.log import strd_logger
import ctgov.utility.file as ufile


log = strd_logger('cde-time-analysis')


def cde_analysis(ddata, dout, ystep=1):
    if ystep < 1:
        log.error('the year step needs to be greater than 1 -- interrupting')
        return

    dout = '%s/year-step-%d' % (dout, ystep)
    if not ufile.mkdir(dout):
        log.error('impossible to create the output directory - interrupting')
        return

    # get list of diseases
    ddata = '%s/year-step-%d' % (ddata, ystep)
    ldis = sorted(os.walk(ddata).next()[1])
Exemple #2
0
'''
 	Retrieve Disease - NCT associations starting from a list of diseases

 	@author: Riccardo Miotto
'''

from ctgov.utility.log import strd_logger
from ctgov.utility.web import download_web_data
from collections import defaultdict
import ctgov.index.es_index as es_index
import xml.etree.ElementTree as xml_parser
import ctgov.utility.file as ufile
import argparse, sys

log = strd_logger('disease-nct-association')


def mine_disease_to_nct(ldisease, fout=None, ctmin=100):
    url = 'http://clinicaltrials.gov/search?cond=%s&displayxml=true&count=%s'
    log.info('found %d disease to process \n' % len(ldisease))
    ldisease = sorted(map(lambda x: ' '.join(x.lower().split()), ldisease))
    nct_disease = defaultdict(list)
    c = 1
    for d in sorted(ldisease):
        log.info('processing: "%s"' % d)
        d = d.replace(',', '')
        fd = d.replace(' ', '+')

        # number of trials
        xmltree = xml_parser.fromstring(download_web_data(url % (fd, '0')))
        nres = xmltree.get('count')
Exemple #3
0
  @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>

  Modified by @author: Praveen Chandar
"""

from ctgov.utility.log import strd_logger
from ctgov.utility.web import clean_text
from ctgov.concept_mapping.filters import ConceptFilters
from ctgov.concept_mapping.dict_mapping import DictionaryMapping
from itertools import groupby
import math
import nltk
import string

log = strd_logger('concept-tagger')


class Tagger:
    # constructor
    def __init__(self, ngram=5, stop=None, umls=None, ptag=None):
        """
        Build the concept filter and the dictionary mapper used by the tagger.

        :param ngram: forwarded to ConceptFilters and also kept as ``self.ngram``
        :param stop: forwarded to ConceptFilters
                     (presumably the stop-word data -- TODO confirm)
        :param umls: forwarded to DictionaryMapping
                     (presumably the UMLS dictionary -- TODO confirm)
        :param ptag: forwarded to ConceptFilters
                     (presumably the admitted POS tags -- TODO confirm)
        """
        self.filter = ConceptFilters(ngram, stop, ptag)
        self.mapper = DictionaryMapping(umls)
        self.ngram = ngram

    def process_text(self, text):
        ptxt = self.process_section(text)
        return ptxt

    def process(self, ec_dict):
        pec = {}
Exemple #4
0
"""
    <Module Explanation>
    @author: Praveen Chandar
"""
from ctgov.utility.log import strd_logger
from datetime import datetime
from ctgov.utility.web import clean_text
import xml.etree.ElementTree as xml_parser
import math
import re

log = strd_logger('ctgov-parser')


class ClinicalTrial_Parser(object):
    def __init__(self, data_path):
        """
        :param data_path: directory holding the trial files; ``parse`` reads
                          ``<data_path>/<nct_id>.xml``
        """
        self.data_path = data_path

    def parse(self, nct_id):
        try:
            trail_path = self.data_path + '/' + nct_id + '.xml'
            xml = xml_parser.parse(trail_path)

            # general
            doc = {}
            doc['title'] = self.__get_info(xml, 'brief_title')
            doc['study_type'] = self.__get_info(xml, 'study_type')

            # Add conditions
            cond = xml.findall('condition')
            conditions = []
Exemple #5
0
"""
    The module contains functions to connect to the elastic search index.

    @author: Praveen Chandar
"""
from ctgov.utility.log import strd_logger
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch import ConnectionError
import json

log = strd_logger('elasticsearch-index')


class ElasticSearch_Index(object):
    def __init__(self, index_name, host='localhost', port=9200):
        """
        Store the index settings and create the connection object.

        :param index_name: name of the index, kept as ``self.index_name``
        :param host: host name, kept as ``self.host_name``
        :param port: port number, kept as ``self.port_number``
        """
        self.index_name = index_name
        self.host_name = host
        self.port_number = port
        # document type is fixed for every instance
        self.doc_type = 'trial'

        # must come last: get_es_conn reads the attributes assigned above
        self.es = self.get_es_conn()


    def get_es_conn(self):
        """
        Create an ElasticSearch() object

        :return: Elasticsearch() instance
        """
        assert isinstance(self.host_name, str)
Exemple #6
0
'''
 Extract relevant tags from a text

  @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>
'''

import nltk, string, itertools
from ctgov.utility.log import strd_logger
from ctgov.utility.web import clean_text

log = strd_logger('textprocesser')
conj = set(['and', 'or'])


class TextProcesser:
    # constructor
    def __init__(self, text, ngram=5, stop=None, umls=None, ptag=None):
        try:
            self.text = str(text)
        except UnicodeEncodeError:
            self.text = str(text.encode('utf-8'))
        self.text = self.text.lower().strip()
        self.text = self.text.replace('- ', ' ').replace(' -', ' ')

        # get filtering data
        self.ngr = ngram
        if not stop:
            self.stop = (set(), set())
        else:
            self.stop = stop
        self.umls = umls
Exemple #7
0
"""
    Retrieve clinical trials and store them in a directory
    @author: Praveen Chandar
"""

from ctgov.utility.log import strd_logger
from multiprocessing import Process, Queue
import argparse, sys, math
import urllib2, urllib3, json
import os, shutil
import re

log = strd_logger('nct-processer')
# create directory (delete if one with the same name already exists)
def mkdir(dirname, force_create=False):
    """
    Create the directory ``dirname`` (including missing parents).

    An already existing directory is left untouched unless ``force_create``
    is set, in which case it is removed and re-created empty.

    :param dirname: path of the directory to create
    :param force_create: if True, delete an existing directory first
    :return: True if the directory exists when the function returns,
             False if it could not be created (the error is logged)
    """
    try:
        if force_create and os.path.isdir(dirname):
            shutil.rmtree(dirname)
        try:
            os.makedirs(dirname)
        except OSError:
            # "already exists as a directory" is the only failure that is
            # fine to ignore; a plain file in the way, a permission error,
            # etc. must be reported to the caller
            if not os.path.isdir(dirname):
                raise
    except Exception as e:
        log.error(e)
        return False
    return True


def download_web_data(url):
    try:
"""
    Clinical Trial representation
    @author: Riccardo Miotto
"""

from ctgov.concept_mapping.textprocesser import TextProcesser
from ctgov.concept_mapping.cvalue import substring_filtering
from ctgov.utility.web import download_web_data
from ctgov.utility.log import strd_logger
from datetime import datetime
import xml.etree.ElementTree as xml_parser
import math, re

log = strd_logger('clinical-trial')


class ClinicalTrial(object):
    def __init__(self, nctid, data_path):
        self.trail_path = data_path + '/' + nctid + '.xml'
        self.id = nctid
        self.title = None
        self.condition = set()
        self.study_type = None
        self.start_date = None
        self.firstreceived_date = None
        self.verification_date = None
        self.lastchanged_date = None
        self.completion_date = None
        self.gender = None
        self.minimum_age = None
        self.maximum_age = None
Exemple #9
0
'''
 	Mine CDEs from a collection of trials associated to a disease

  	@author: Riccardo Miotto
'''

from cvalue import substring_filtering
from ctgov.utility.log import strd_logger
import math, numpy, operator

log = strd_logger('cde')
'''
	mine the CDEs
'''


def cde_miner(pnct, tags, freq=0.01, umls=None):
    # mine CDEs
    cde = _mine_cde(pnct, freq, tags)
    log.info('------ retained %d CDEs' % len(cde))

    # assign cde to inclusion and exclusion
    ie_cde = {}
    for k, ct in pnct.iteritems():
        for it in ct.pec:
            itdict = ie_cde.setdefault(it, {})
            for t in ct.pec[it]:
                if t in cde:
                    v = itdict.setdefault(t, 0)
                    itdict[t] = v + 1
            ie_cde[it] = itdict
"""
    Retrieve clinical trials and store them in a directory
    @author: Praveen Chandar
"""

from ctgov.utility.log import strd_logger
from multiprocessing import Process, Queue
import ctgov.utility.file as file_utils
import ctgov.index.es_index as es_index
import ctgov.index.ctgov_parser as ctgov_parser
import argparse, sys, math
import os

log = strd_logger('nct-indexer')


def nct_index(din, index_name, host='localhost', port_no=9200, nprocs=1, settings_file=None):
    # open the clinical trail ids file and load to a list
    log.info('opening file -- trial_ids.txt')

    nct_ids = []
    for line in open(din + '/trial_ids.txt', 'rb'):
        nct_ids.append(line.strip())


    # Check directories
    trials_din = din + '/trials_xml/'
    if not os.path.exists(trials_din):
        log.error('trials_xml directory does not exists in %s \n' % din)
        exit(0)
Exemple #11
0
"""
 Functions to interact with ClinicalTrials.gov

 @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>

 Modified on Sep 15th 2014
 @author: Praveen Chandar < (at) columbia (dot) edu >
"""
import re
from ctgov.utility.web import download_web_data
from ctgov.utility.log import strd_logger


log = strd_logger('ctgov-fetch')


def get_clinical_trials():
    """
    Obtain the latest list of all clinical trials from clinicaltrials.gov.

    The crawl index page is downloaded first; every numbered crawl page
    linked from it is then downloaded in turn and the NCT identifiers
    found in its links are collected.

    :return: sorted list of the unique NCT identifiers found
    """
    base_url = 'http://clinicaltrials.gov/ct2/crawl'
    index_html = download_web_data(base_url)
    nct_ids = set()
    for page_no in re.findall(r'href="/ct2/crawl/(\d+)"', index_html):
        page_html = download_web_data('%s/%s' % (base_url, page_no))
        nct_ids.update(re.findall(r'href="/ct2/show/(NCT\d+)"', page_html))
    return sorted(nct_ids)
Exemple #12
0
  @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>

  Modified by @author: Praveen Chandar
"""

from ctgov.utility.log import strd_logger
from ctgov.utility.web import clean_text
from ctgov.concept_mapping.filters import ConceptFilters
from ctgov.concept_mapping.dict_mapping import DictionaryMapping
from itertools import groupby
import math
import nltk
import string


log = strd_logger('concept-tagger')


class Tagger:
    # constructor
    def __init__(self, ngram=5, stop=None, umls=None, ptag=None):
        """
        Build the concept filter and the dictionary mapper used by the tagger.

        :param ngram: forwarded to ConceptFilters and also kept as ``self.ngram``
        :param stop: forwarded to ConceptFilters
                     (presumably the stop-word data -- TODO confirm)
        :param umls: forwarded to DictionaryMapping
                     (presumably the UMLS dictionary -- TODO confirm)
        :param ptag: forwarded to ConceptFilters
                     (presumably the admitted POS tags -- TODO confirm)
        """
        self.filter = ConceptFilters(ngram, stop, ptag)
        self.mapper = DictionaryMapping(umls)
        self.ngram = ngram

    def process_text(self, text):
        ptxt = self.process_section(text)
        return ptxt

    def process(self, ec_dict):
        pec = {}
Exemple #13
0
"""
    Mine CDEs for a set of diseases
    @param yearstep: step for time analysis

    @author: Riccardo Miotto
"""

from ctgov.utility.log import strd_logger
from ctgov.miner.cde import cde_miner
from ctgov.load_data import load_umls
from datetime import timedelta
import ctgov.utility.file as ufile
import argparse, sys, math, datetime, zipfile, os, shutil

log = strd_logger('cde-concept_mapping')


def mining_cde(nct, disease, nctmin=100, fth=0.03, umls=None, dout=None, yearstep=-1):
    if yearstep <= 0:
        yearstep = -1

    # get year interval
    yi = _year_interval(yearstep)

    # check output directory
    if not _check_dout(dout):
        return
    if yearstep == -1:
        dout = '%s/all-years' % dout
    else:
        dout = '%s/year-step-%d' % (dout, yearstep)
"""
    <Module Explanation>
    @author: Praveen Chandar
"""
import itertools
from ctgov.utility.log import strd_logger

log = strd_logger('dict-mapping')


class DictionaryMapping(object):
    def __init__(self, umls):
        """
        :param umls: UMLS resource used for the mapping; when it is falsy,
                     ``map`` logs a warning and returns an empty list
        """
        # fallbacks tried by map() when the direct mapping returns None
        self.use_scramble_find = True
        self.use_split_dashed_words = True
        self.conj = {'and', 'or'}
        self.umls = umls

    def map(self, tokens):
        if not self.umls:
            log.warning('UMLS not loaded')
            return []

        # First do direct mapping
        tags = self._direct_mapping(tokens)

        # If simple direct mapping fails, try other options
        if tags is None and self.use_scramble_find:
            tags = self._scramble_find(tokens)

        # If scrambling fails, look for dashed words
        if tags is None and self.use_split_dashed_words:
"""
    Time Analysis of CDEs
    @author: Riccardo Miotto
"""

import argparse
from matplotlib.backends.backend_pdf import PdfPages
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
from ctgov.utility.log import strd_logger
import ctgov.utility.file as ufile

log = strd_logger('cde-time-analysis')


def cde_analysis(ddata, dout, ystep=1):
    if ystep < 1:
        log.error('the year step needs to be greater than 1 -- interrupting')
        return

    dout = '%s/year-step-%d' % (dout, ystep)
    if not ufile.mkdir(dout):
        log.error('impossible to create the output directory - interrupting')
        return

    # get list of diseases
    ddata = '%s/year-step-%d' % (ddata, ystep)
    ldis = sorted(os.walk(ddata).next()[1])

    yinterval = _year_interval(ystep)
Exemple #16
0
"""
    Mine CDEs for a set of diseases
    @param yearstep: step for time analysis

    @author: Riccardo Miotto
"""

from ctgov.utility.log import strd_logger
from ctgov.miner.cde import cde_miner
from ctgov.load_data import load_umls
from datetime import timedelta
import ctgov.utility.file as ufile
import argparse, sys, math, datetime, zipfile, os, shutil

log = strd_logger('cde-concept_mapping')


def mining_cde(nct,
               disease,
               nctmin=100,
               fth=0.03,
               umls=None,
               dout=None,
               yearstep=-1):
    if yearstep <= 0:
        yearstep = -1

    # get year interval
    yi = _year_interval(yearstep)

    # check output directory
Exemple #17
0
"""
    Retrieve clinical trials and store them in a directory
    @author: Praveen Chandar
"""

from ctgov.utility.log import strd_logger
from multiprocessing import Process, Queue
import ctgov.utility.file as file_utils
import ctgov.index.es_index as es_index
import ctgov.index.ctgov_parser as ctgov_parser
import argparse, sys, math
import os

log = strd_logger('nct-indexer')


def nct_index(din,
              index_name,
              host='localhost',
              port_no=9200,
              nprocs=1,
              settings_file=None):
    # open the clinical trail ids file and load to a list
    log.info('opening file -- trial_ids.txt')

    nct_ids = []
    for line in open(din + '/trial_ids.txt', 'rb'):
        nct_ids.append(line.strip())

    # Check directories
    trials_din = din + '/trials_xml/'
Exemple #18
0
"""
    Apply tagging process
    @author: Praveen Chandar
"""

from ctgov.load_data import load_data
from ctgov.utility.log import strd_logger
from multiprocessing import Process
import ctgov.index.es_index as es_index
from ctgov.concept_mapping.tagger import Tagger
import argparse
import sys
import math

log = strd_logger('tag-miner')


def nct_tagging(index_name,
                host,
                port_no,
                process_ids,
                stopwords,
                umls,
                pos,
                nprocs=1):

    # open the clinical trail ids file to process
    nct_ids = []
    for line in open(process_ids, 'rb'):
        nct_ids.append(line.strip())
Exemple #19
0
'''
 	Mine CDEs from a collection of trials associated to a disease

  	@author: Riccardo Miotto
'''

from cvalue import substring_filtering
from ctgov.utility.log import strd_logger
import math, numpy, operator

log = strd_logger('cde')

'''
	mine the CDEs
'''


def cde_miner(pnct, tags, freq=0.01, umls=None):
    # mine CDEs
    cde = _mine_cde(pnct, freq, tags)
    log.info('------ retained %d CDEs' % len(cde))

    # assign cde to inclusion and exclusion
    ie_cde = {}
    for k, ct in pnct.iteritems():
        for it in ct.pec:
            itdict = ie_cde.setdefault(it, {})
            for t in ct.pec[it]:
                if t in cde:
                    v = itdict.setdefault(t, 0)
                    itdict[t] = v + 1
'''
 	Retrieve Disease - NCT associations starting from a list of diseases

 	@author: Riccardo Miotto
'''

from ctgov.utility.log import strd_logger
from ctgov.utility.web import download_web_data
from collections import defaultdict
import ctgov.index.es_index as es_index
import xml.etree.ElementTree as xml_parser
import ctgov.utility.file as ufile
import argparse, sys

log = strd_logger('disease-nct-association')


def mine_disease_to_nct(ldisease, fout=None, ctmin=100):
    url = 'http://clinicaltrials.gov/search?cond=%s&displayxml=true&count=%s'
    log.info('found %d disease to process \n' % len(ldisease))
    ldisease = sorted(map(lambda x: ' '.join(x.lower().split()), ldisease))
    nct_disease = defaultdict(list)
    c = 1
    for d in sorted(ldisease):
        log.info('processing: "%s"' % d)
        d = d.replace(',', '')
        fd = d.replace(' ', '+')

        # number of trials
        xmltree = xml_parser.fromstring(download_web_data(url % (fd, '0')))
        nres = xmltree.get('count')
'''
 	Retrieve and Process Clinical Trials (extract n-grams from eligibility criteria)
 	
 	@author: Riccardo Miotto
'''

from ctgov.utility.log import strd_logger
from ctgov.load_data import load_data
from multiprocessing import Process, Queue
from ctgov.index._clinicaltrial import ClinicalTrial
import ctgov.utility.file as ufile
import ctgov.index as ctgov
import argparse, sys, math

log = strd_logger('nct-processer')


def nct_processer(dout, stop=None, umls=None, ptag=None, nprocs=1):
    # get the list of clinical trials
    log.info('downloading the list of clinical trials')
    nct = index.get_clinical_trials()
    if len(nct) == 0:
        log.error(' --- not found any clinical trials - interrupting \n')
        return
    log.info(' --- found %d clinical trials \n' % len(nct))

    # process each clinical trial
    log.info('processing clinical trials')
    qout = Queue()
    procs = []
    chunksize = int(math.ceil(len(nct) / float(nprocs)))
Exemple #22
0
"""
 Functions to interact with ClinicalTrials.gov

 @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>

 Modified on Sep 15th 2014
 @author: Praveen Chandar < (at) columbia (dot) edu >
"""
import re
from ctgov.utility.web import download_web_data
from ctgov.utility.log import strd_logger

log = strd_logger('ctgov-fetch')


def get_clinical_trials():
    """
    Obtain the latest list of all clinical trials from clinicaltrials.gov.

    Downloads the crawl index, then each numbered crawl page it links to,
    gathering the NCT identifiers referenced on those pages.

    :return: sorted list of the unique NCT identifiers found
    """
    crawl_url = 'http://clinicaltrials.gov/ct2/crawl'
    page_numbers = re.findall(r'href="/ct2/crawl/(\d+)"',
                              download_web_data(crawl_url))
    found = set()
    for number in page_numbers:
        listing = download_web_data('%s/%s' % (crawl_url, number))
        found.update(re.findall(r'href="/ct2/show/(NCT\d+)"', listing))
    return sorted(found)
Exemple #23
0
"""
    The module contains functions to connect to the elastic search index.

    @author: Praveen Chandar
"""
from ctgov.utility.log import strd_logger
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch import ConnectionError
import json

log = strd_logger('elasticsearch-index')


class ElasticSearch_Index(object):
    def __init__(self, index_name, host='localhost', port=9200):
        """
        Store the index settings and create the connection object.

        :param index_name: name of the index, kept as ``self.index_name``
        :param host: host name, kept as ``self.host_name``
        :param port: port number, kept as ``self.port_number``
        """
        self.index_name = index_name
        self.host_name = host
        self.port_number = port
        # document type is fixed for every instance
        self.doc_type = 'trial'

        # must come last: get_es_conn reads the attributes assigned above
        self.es = self.get_es_conn()

    def get_es_conn(self):
        """
        Create an ElasticSearch() object

        :return: Elasticsearch() instance
        """
        assert isinstance(self.host_name, str)
"""
    Apply tagging process
    @author: Praveen Chandar
"""

from ctgov.load_data import load_data
from ctgov.utility.log import strd_logger
from multiprocessing import Process
import ctgov.index.es_index as es_index
from ctgov.concept_mapping.tagger import Tagger
import argparse
import sys
import math


log = strd_logger('tag-miner')


def nct_tagging(index_name, host, port_no, process_ids,
                stopwords, umls, pos, nprocs=1):

    # open the clinical trail ids file to process
    nct_ids = []
    for line in open(process_ids, 'rb'):
        nct_ids.append(line.strip())

    # Check if index exists
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)
    index.add_field('ec_tags_umls', term_vector=True)

    # Get clinical
"""
    <Module Explanation>
    @author: Praveen Chandar
"""
from ctgov.utility.log import strd_logger
from datetime import datetime
from ctgov.utility.web import clean_text
import xml.etree.ElementTree as xml_parser
import math
import re

log = strd_logger('ctgov-parser')


class ClinicalTrial_Parser(object):
    def __init__(self, data_path):
        """
        :param data_path: directory holding the trial files; ``parse`` reads
                          ``<data_path>/<nct_id>.xml``
        """
        self.data_path = data_path

    def parse(self, nct_id):
        try:
            trail_path = self.data_path + '/' + nct_id + '.xml'
            xml = xml_parser.parse(trail_path)

            # general
            doc = {}
            doc['title'] = self.__get_info(xml, 'brief_title')
            doc['study_type'] = self.__get_info(xml, 'study_type')

            # Add conditions
            cond = xml.findall('condition')
            conditions = []