Example #1
def normalize_freebase_output(text):
    """Remove starting and ending quotes and the namespace prefix.

    :param text:
    :return:
    """
    if len(text) > 1 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    return globals.remove_freebase_ns(text)
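A minimal usage sketch. This is hypothetical: the real prefix handling lives in the project's globals module, which is stubbed out here, and the 'fb:' prefix is an assumption for illustration only.

import types

# Stub standing in for the project's `globals` module; the assumed
# 'fb:' prefix is for illustration only.
globals = types.SimpleNamespace(
    remove_freebase_ns=lambda t: t[3:] if t.startswith('fb:') else t)

print(normalize_freebase_output('"fb:m.0abc"'))  # -> m.0abc
print(normalize_freebase_output('plain text'))   # -> plain text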
Example #2
 # Assumes module-level imports: mmap and numpy as np, plus a configured
 # `logger` and the project's `globals.remove_freebase_ns` helper.
 def build_index(self, index_file_prefix, facts_file):
     logger.info("Building new mediator index.")
     num_lines = 0
     entity_postings = {}
     # Read the vocabulary.
     logger.info("Building vocabulary.")
     vocab__words_set = set()
     # Open in binary mode: mmap requires a binary file descriptor and
     # readline() yields bytes.
     with open(facts_file, 'rb') as f:
         mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
         line = mm.readline()
         while line:
             # mmap yields bytes; decode before splitting on tabs.
             cols = line.decode('utf-8').strip().split('\t')
             if len(cols) != 4:
                 logger.warning("Invalid line: %s" % line)
                 line = mm.readline()
                 num_lines += 1
                 continue
             cols = [globals.remove_freebase_ns(x) for x in cols]
             vocab__words_set.update(cols)
             line = mm.readline()
             num_lines += 1
             if num_lines % 2000000 == 0:
                 logger.info("Processed %s lines." % num_lines)
     vocabulary_words = sorted(vocab__words_set)
     # Map each word to its index for fast id lookup in the second pass.
     vocabulary = {word: i for i, word in enumerate(vocabulary_words)}
     # Second pass, this time with vocabulary.
     logger.info("Building index.")
     num_lines = 0
     with open(facts_file, 'rb') as f:
         mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
         line = mm.readline()
         while line:
             cols = line.decode('utf-8').strip().split('\t')
             if len(cols) != 4:
                 logger.warning("Invalid line: %s" % line)
                 line = mm.readline()
                 num_lines += 1
                 continue
             cols = [globals.remove_freebase_ns(x) for x in cols]
             # Map value, relation and mediator to their vocabulary ids;
             # the third column is not indexed.
             value_id = vocabulary[cols[0]]
             relation_id = vocabulary[cols[1]]
             mediator_id = vocabulary[cols[3]]
             entity_postings.setdefault(value_id, []).append(
                 (mediator_id, relation_id))
             line = mm.readline()
             num_lines += 1
             if num_lines % 2000000 == 0:
                 logger.info("Processed %s lines." % num_lines)
     logger.info("Sorting postings...")
     for k, v in entity_postings.items():
         a = sorted(v)
         # Flatten the (mediator_id, relation_id) tuples into one id sequence.
         a = [x for y in a for x in y]
         entity_postings[k] = np.array(a, dtype=np.uint32)
     total_postings = sum(len(x) for x in entity_postings.values())
     logger.info("Number of posting lists: %s " % len(entity_postings))
     logger.info("Avg. posting list length: %s " % (total_postings
                                                    / float(len(entity_postings))))
     logger.info("Writing index.")
     index_handle, offsets, sizes = write_index(index_file_prefix,
                                                vocabulary_words, entity_postings)
     self.vocabulary_words = vocabulary_words
     self.index = index_handle
     self.offsets = offsets
     self.sizes = sizes
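write_index is not part of this snippet. Purely as a sketch of what a compatible implementation could look like, assuming it concatenates the per-entity uint32 arrays into one postings file, writes the vocabulary next to it, and returns a read-only mmap handle with per-entity offsets and sizes. The file names and on-disk layout below are assumptions, not the project's actual format.

import mmap
import numpy as np

def write_index(index_file_prefix, vocabulary_words, entity_postings):
    # Persist the vocabulary, one word per line (assumed layout).
    with open(index_file_prefix + '.vocab', 'w', encoding='utf-8') as f:
        for word in vocabulary_words:
            f.write(word + '\n')
    # Concatenate all posting arrays into a single binary file, recording
    # where each entity's list starts (byte offset) and its element count.
    offsets = {}
    sizes = {}
    with open(index_file_prefix + '.postings', 'wb') as f:
        pos = 0
        for entity_id, postings in sorted(entity_postings.items()):
            offsets[entity_id] = pos
            sizes[entity_id] = len(postings)
            data = postings.tobytes()
            f.write(data)
            pos += len(data)
    # Hand back a read-only mmap over the postings file; the file object is
    # kept open on purpose so the mapping stays valid. Assumes the index is
    # non-empty (mmap cannot map a zero-length file).
    f = open(index_file_prefix + '.postings', 'rb')
    index_handle = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
    return index_handle, offsets, sizes

Under this layout, entity e's flattened (mediator_id, relation_id) pairs can be read back as np.frombuffer(index_handle[offsets[e]:offsets[e] + sizes[e] * 4], dtype=np.uint32).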