Example #1
    def extract_nlp(self, text):
        digest_data = 'nlptags.cache_' + \
            md5(text.encode('ascii', 'ignore')).hexdigest()

        if not os.path.exists(self.cachedir):
            print >> sys.stderr, '[cache error] directory %s does not exist' % self.cachedir
            try:
                print '[cache info] Creating caches directory'
                os.makedirs(self.cachedir)
            except:
                print >> sys.stderr, '[cache error] Failed to create caches directory'
                sys.exit(1)
        cache_path = os.path.join(self.cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)
        else:
            nlptags = []
            words = []
            parsed = simplejson.loads(self.server.parse(text))
            for st in parsed['sentences']:
                nlptags.append(self.parse_parsetree(st['parsetree']))
                words.append(st['words'])
            out = prep_for_json({'parsetree': nlptags, 'words': words})
            with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
                json.dump(out, f)
            return out
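For reference, a minimal sketch of the cache-key scheme used above: the key is the md5 hex digest of the ASCII-encoded input text, prefixed with 'nlptags.cache_' and joined to the cache directory. The sample text and the '../cache' directory below are placeholder values, not taken from the example.

# illustration of the cache-key scheme above; the sample text and the
# '../cache' directory are placeholder values
import os
from hashlib import md5

text = u'The patient was admitted with chest pain.'
digest_data = 'nlptags.cache_' + md5(text.encode('ascii', 'ignore')).hexdigest()
cache_path = os.path.join('../cache', digest_data)
print(cache_path)  # e.g. ../cache/nlptags.cache_<32-character hex digest>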
Example #2
    def extract_nlp_batch(self, input_list):
        """
        Extract NLP information from `input_list`

        Returns a <dict> {`sentences<list>`: nlp info,
                          `corefs<list>`: coreference info}
                      `sentences` is a list of nlp info corresponding to entries in `input_list`
                       See method *parse* for more info
        """
        digest_data = 'nlptags_batch.cache_' + \
            md5(str(input_list).encode('ascii', 'ignore')).hexdigest()

        if not os.path.exists(self.cachedir):
            print >> sys.stderr, '[cache error] directory %s does not exist' % self.cachedir
            try:
                print '[cache info] Creating caches directory'
                os.makedirs(self.cachedir)
            except:
                print >> sys.stderr, '[cache error] Failed to create caches directory'
                sys.exit(1)
        cache_path = os.path.join(self.cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)
        else:
            nlptags = []
            corefs = []
            for i in input_list:
                parsed = self.parse(i)
                nlptags.append(parsed['sentences'])
                corefs.append(parsed['coref'])
            res = prep_for_json({'sentences': nlptags, 'corefs': corefs})
            with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
                json.dump(res, f)
            return res
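A hedged usage sketch for the batch variant: the wrapper class name `NLPCache` and its constructor arguments are assumptions made for illustration, since the surrounding class is not shown in the example; the returned keys follow the docstring above.

# hypothetical usage; `NLPCache` and its constructor argument are assumed,
# the returned keys ('sentences', 'corefs') follow the docstring above
nlp = NLPCache(cachedir='../cache')
docs = ['First document text.', 'Second document text.']
res = nlp.extract_nlp_batch(docs)
print(len(res['sentences']))  # one entry per item in the input list
print(res['corefs'][0])       # coreference info for the first document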
Example #3
def batch_filtered(data, semtypes, fields=None, cachedir='../cache',
                   mmjar=MMJAR_PATH, no_cache=False,
                   long_concepts=True):
    """ exactly like batch with the following difference:
        It returns only concepts and their semantic types
        and only for semantic types that are indicated in semtypes
        the output format is a list of concepts with only concept name
         and its semantic type
        default semtypes path is: data/semantic_types.json
    """

    if not no_cache:
        digest_data = ('batch_mm_filtered_{0}.cache'
                       ''.format(md5(json.dumps([fields, data])).hexdigest()))
        if not os.path.exists(cachedir):
            print >> sys.stderr, '[cache error] %s does not exist' % cachedir
            sys.exit(1)
        cache_path = os.path.join(cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)

    out = []
    for elem in data:
        # if fields is not specified, then elem_fields == elem.keys()
        # otherwise list comprehension acts like a filter function
        elem_fields = [k for k in elem.keys()
                       if (not(fields) or (k in fields))]

        result = {fl: run(elem[fl],
                          no_cache=True,
                          mmjar=mmjar,
                          long_concepts=long_concepts)
                  for fl in elem_fields}

        found_concepts = {}
        for fl in elem_fields:
            found_concepts[fl] = []
            for concept in result[fl]['txt']['concepts']:
                if str(concept['semtype'][0]) in semtypes:
                    found_concept = {}
                    found_concept['cname'] = str(concept['cname'])
                    for t in concept['semtype']:
                        found_concept['ctype'] = t
                    found_concepts[fl].append(found_concept)

        out.append(found_concepts)

    if not no_cache:
        out = prep_for_json(out)
        print out
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)

    return out
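A usage sketch for batch_filtered, assuming records whose fields hold free text and a collection of semantic-type codes loaded from the path mentioned in the docstring; the field name 'note' and the input text are illustrative, and no_cache=True is passed so the sketch does not depend on the '../cache' directory existing.

# hypothetical usage; the field name 'note' and the input text are placeholders,
# and data/semantic_types.json is the default path mentioned in the docstring
import json

with open('data/semantic_types.json') as f:
    semtypes = json.load(f)

data = [{'note': 'Patient reports severe headache and nausea.'}]
# no_cache=True skips the on-disk cache for this illustration
filtered = batch_filtered(data, semtypes, fields=['note'], no_cache=True)
for record in filtered:
    for concept in record['note']:
        print('%s -> %s' % (concept['cname'], concept['ctype']))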
Example #4
def batch(data,
          fields=None,
          cachedir='../cache',
          mmjar=MMJAR_PATH,
          no_cache=False,
          long_concepts=True):
    """ batch process all the elements in data and cache
        them in a single file (reduces IO time).
        data is a list of dictionaries. If fields=None, then
        all the fields in every dictionary are cached;
        fields should be a list/tuple containing the relevant
        fields to consider.
    """

    if not no_cache:
        digest_data = ('batch_mm_{0}.cache'
                       ''.format(md5(json.dumps([fields, data])).hexdigest()))
        if not os.path.exists(cachedir):
            print >> sys.stderr, '[cache error] %s does not exist' % cachedir
            sys.exit(1)
        cache_path = os.path.join(cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)

    out = []
    for elem in data:
        # if fields is not specified, then elem_fields == elem.keys()
        # otherwise list comprehension acts like a filter function
        elem_fields = [
            k for k in elem.keys() if (not (fields) or (k in fields))
        ]

        result = {
            fl: run(elem[fl],
                    no_cache=True,
                    mmjar=mmjar,
                    long_concepts=long_concepts)
            for fl in elem_fields
        }
        out.append(result)

    if not no_cache:
        out = prep_for_json(out)
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)

    return out
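A usage sketch for batch; the record fields and texts below are placeholders, and no_cache=True is passed so the sketch does not depend on the '../cache' directory existing.

# hypothetical usage; field names and texts are placeholders
data = [{'title': 'Chest pain on exertion', 'abstract': 'The patient presented with ...'},
        {'title': 'Migraine follow-up',     'abstract': 'Recurrent headaches were ...'}]

# process only the 'abstract' field of each record; no_cache=True avoids
# requiring the cache directory to exist for this illustration
results = batch(data, fields=['abstract'], no_cache=True)
for record in results:
    print(record['abstract'])  # whatever run() returned for that field's text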
Example #5
def batch(data, fields=None, cachedir='../cache',
          mmjar=MMJAR_PATH, no_cache=False,
          long_concepts=True):
    """ batch process all the elements in data and cache
        them in a single file (reduces IO time).
        data is a list of dictionaries. If fields=None, then
        all the fields in every dictionary are cached;
        fields should be a list/tuple containing the relevant
        fields to consider.
    """

    if not no_cache:
        digest_data = ('batch_mm_{0}.cache'
                       ''.format(md5(json.dumps([fields, data])).hexdigest()))
        if not os.path.exists(cachedir):
            print >> sys.stderr, '[cache error] %s does not exist' % cachedir
            sys.exit(1)
        cache_path = os.path.join(cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)

    out = []
    for elem in data:
        # if fields is not specified, then elem_fields == elem.keys()
        # otherwise list comprehension acts like a filter function
        elem_fields = [k for k in elem.keys()
                       if (not(fields) or (k in fields))]

        result = {fl: run(elem[fl],
                          no_cache=True,
                          mmjar=mmjar,
                          long_concepts=long_concepts)
                  for fl in elem_fields}
        out.append(result)

    if not no_cache:
        out = prep_for_json(out)
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)

    return out
Example #6
    def extract_nlp_batch(self, input_list):
        """
        Extract NLP information from `input_list`

        Returns a <dict> {`sentences<list>`: nlp info,
                          `corefs<list>`: coreference info}
                      `sentences` is a list of nlp info corresponding to entries in `input_list`
                       See method *parse* for more info
        """
        digest_data = 'nlptags_batch.cache_' + \
            md5(str(input_list).encode('ascii', 'ignore')).hexdigest()

        if not os.path.exists(self.cachedir):
            print >> sys.stderr, '[cache error] directory %s does not exist' % self.cachedir
            try:
                print '[cache info] Creating caches directory'
                os.makedirs(self.cachedir)
            except:
                print >> sys.stderr, '[cache error] Failed to create caches directory'
                sys.exit(1)
        cache_path = os.path.join(self.cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)
        else:
            nlptags = []
            corefs = []
            for i in input_list:
                parsed = self.parse(i)
                nlptags.append(parsed['sentences'])
                corefs.append(parsed['coref'])
            res = prep_for_json(
                {'sentences': nlptags, 'corefs': corefs})
            with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
                json.dump(res, f)
            return res
Example #7
def batch_filtered(data,
                   semtypes,
                   fields=None,
                   cachedir='../cache',
                   mmjar=MMJAR_PATH,
                   no_cache=False,
                   long_concepts=True):
    """ exactly like batch with the following difference:
        It returns only concepts and their semantic types
        and only for semantic types that are indicated in semtypes
        the output format is a list of concepts with only concept name
         and its semantic type
        default semtypes path is: data/semantic_types.json
    """

    if not no_cache:
        digest_data = ('batch_mm_filtered_{0}.cache'
                       ''.format(md5(json.dumps([fields, data])).hexdigest()))
        if not os.path.exists(cachedir):
            print >> sys.stderr, '[cache error] %s does not exist' % cachedir
            sys.exit(1)
        cache_path = os.path.join(cachedir, digest_data)
        if os.path.exists(cache_path):
            with codecs.open(cache_path, mode='rb', encoding='utf-8') as f:
                return json.load(f)

    out = []
    for elem in data:
        # if fields is not specified, then elem_fields == elem.keys()
        # otherwise list comprehension acts like a filter function
        elem_fields = [
            k for k in elem.keys() if (not (fields) or (k in fields))
        ]

        result = {
            fl: run(elem[fl],
                    no_cache=True,
                    mmjar=mmjar,
                    long_concepts=long_concepts)
            for fl in elem_fields
        }

        found_concepts = {}
        for fl in elem_fields:
            found_concepts[fl] = []
            for concept in result[fl]['txt']['concepts']:
                if str(concept['semtype'][0]) in semtypes:
                    found_concept = {}
                    found_concept['cname'] = str(concept['cname'])
                    for t in concept['semtype']:
                        found_concept['ctype'] = t
                    found_concepts[fl].append(found_concept)

        out.append(found_concepts)

    if not no_cache:
        out = prep_for_json(out)
        print out
        with codecs.open(cache_path, mode='wb', encoding='utf-8') as f:
            json.dump(out, f)

    return out