Example #1
import math
import xlrd.biffh
from pandas import ExcelFile, Series, DataFrame

def excel_to_corpus(excel_path, corpus_path):
    '''NB! Make sure to use the .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    excel  = ExcelFile(excel_path)
    # since we do not know the number of sheets, parse all of them
    # until we get an error
    idx = 0
    while True:
        try:
            df = excel.parse(str(idx))
            # recreate some information that was modified when exporting to xls
            new_df = dict()
            for col in df.columns:
                data = []
                for v in df[col]:
                    if type(v) == float and math.isnan(v):
                        data.append(None)
                    elif v == 0:
                        data.append(False)
                    elif v == 1:
                        data.append(True)
                    else:
                        data.append(v)
                new_df[col] = Series(data)
            corpus[str(idx)] = DataFrame(new_df)
        except xlrd.biffh.XLRDError:
            break
        idx += 1
    corpus.close()
Example #2
from pandas import ExcelWriter

def corpus_to_excel(corpus_path, excel_path):
    '''NB! Make sure to use the .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    writer = ExcelWriter(excel_path)
    for key in corpus:
        corpus[key].to_excel(writer, sheet_name=key)
    writer.save()
    corpus.close()
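
Taken together, the two helpers form a round trip. A minimal usage sketch with made-up file names (PyCorpus itself comes from the project):

# Hypothetical round trip: dump a corpus to .xls, then rebuild it.
corpus_to_excel('docs.corpus', 'docs.xls')
excel_to_corpus('docs.xls', 'docs_restored.corpus')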
Example #3
import codecs
import re

def boi_to_t3corpus(orig_path, t3_path):
    '''Parse a corpus in which documents are separated by -- lines
       into a t3 corpus at t3_path.'''
    f = codecs.open(orig_path, 'rb', 'utf-8')
    contents = f.read()
    f.close()
    docs = re.split('--\r?\n\r?\n', contents)
    corpus = PyCorpus(t3_path)
    for i, doc in enumerate(docs):
        corpus[str(i+1)] = parse_t3_doc_from_string(doc)
    corpus.close()
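
For reference, a minimal sketch of the input this function expects (sample text invented): documents separated by a '--' line followed by a blank line.

# Hypothetical orig_path contents for boi_to_t3corpus:
sample = u'first document text\n--\n\nsecond document text\n'
# re.split('--\r?\n\r?\n', sample) yields the two document strings.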
Example #4
    def startup(self, startFresh=False):
        """
        Starts up my persistence engine.

        This method replaces the stock C{adj} adjacency list attribute with an
        instance of L{PersistentDict}, or for directed graphs, by replacing the
        C{succ} and C{pred} adjacency list attributes with two such instances.

        Important note regarding directed graph persistence
        ===================================================

        In directed graphs, we actually overwrite the C{adj} attribute instead
        of C{succ}. That's because C{adj} is used in the C{NX.Graph} superclass
        and C{succ} is merely set equal to it in the constructor of
        C{NX.DiGraph}.

        After C{x = y} in Python, changing some property of C{y} causes the same
        change in C{x}. However, we are changing the attribute C{adj} to
        reference an I{entirely new} object, not just changing its
        properties. Thus we have to refresh the C{self.succ = self.adj} link
        after overwriting C{adj}, giving C{succ} a reference to the I{new}
        C{adj} object. Adding or changing I{items} of either one will show up
        in the other, so no further hacking is required.

        @param startFresh: Set this keyword C{True} to clear any
          persisted content and start fresh. This keyword can also be
          set in the constructor. Obviously, you should use this
          option with care as it will B{erase} database entries!

        @return: A deferred that fires when the persistence engine is ready
          for use.
        
        """
        self._uniqueCount = 0

        def ID():
            self._uniqueCount += 1
            thisID = "%s-%d" % (self.name, self._uniqueCount)
            return hash(thisID)

        def started(null):
            self.succ = self.adj
            if startFresh or self.startFresh:
                return self.adjacencyListOperation("clear")

        dList = []
        url, kw = self.engineParams
        kw['nameType'] = self.nodeType
        # Adjacency lists
        for dictName in self.adjacencyLists:
            dictObject = PersistentDict(ID(), url, **kw)
            setattr(self, dictName, dictObject)
            dList.append(dictObject.preload())
        return defer.DeferredList(dList).addCallback(started)
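
The aliasing note in the docstring can be shown in isolation. A minimal sketch using a toy stand-in class (not the real graph type):

class Toy(object):
    def __init__(self):
        self.adj = {}
        self.succ = self.adj          # alias: both names refer to one dict

t = Toy()
t.adj['a'] = {'b': 1}                 # item change shows up through both names
assert t.succ['a'] == {'b': 1}
t.adj = {}                            # rebinding: succ still holds the OLD dict
assert t.succ == {'a': {'b': 1}}
t.succ = t.adj                        # refresh the link, as started() does above
assert t.succ is t.adj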
Example #5
File: ner.py Project: estnltk/pfe
import sys
import cPickle

def crf_model_predict(model_path, corpus, target_path, series_name):
    # the model file holds a pickled (model, kwargs) tuple
    f = open(model_path, 'rb')
    model, kwargs = cPickle.load(f)
    f.close()
    s = Corpus(target_path)
    for doc_id, predictions in crf_predict(model, corpus, **kwargs):
        doc = corpus[doc_id]
        doc[series_name] = predictions
        s[doc_id] = doc
        sys.stderr.write('Document {0} classified.\n'.format(doc_id))
    s.close()
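
The load above implies the on-disk format: a pickled (model, kwargs) tuple. A hypothetical writer to match (save_crf_model is not part of the project code shown here):

import cPickle

def save_crf_model(model, kwargs, model_path):
    # write the (model, kwargs) tuple that crf_model_predict unpickles
    f = open(model_path, 'wb')
    cPickle.dump((model, kwargs), f)
    f.close()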
Example #6
import codecs
import cStringIO
import os
import sys

def parse_wikipedia(path, corpus_path):
    '''path - the directory containing the extracted documents by
       WikiExtractor.py.
       corpus_path - the filename to store the parsed corpus.
    '''
    corpus = PyCorpus(corpus_path)

    def from_path(path):
        sys.stderr.write('Processing path ' + path + '\n')
        files = os.listdir(path)
        for f in files:
            newpath = os.path.join(path, f)
            if os.path.isdir(newpath):
                from_path(newpath)
            else:
                sys.stderr.write('Processing file ' + newpath + '\n')
                get_documents(newpath)

    def get_documents(path):
        f = codecs.open(path, 'r', 'utf-8')
        contents = f.read()
        f.close()
        doctexts = contents.split('<doc id="')

        for text in doctexts:
            text = text.strip()
            if len(text) < 1:
                continue
            # extract the document parts
            doc_id = int(text[:text.index('"')])
            title  = text[text.index('title="')+7 : text.index('"', text.index('title="')+7)]
            text   = text[text.index('\n') : text.index('</doc>')].strip()

            text_stream = cStringIO.StringIO(text.encode('utf-8'))
            utf8_stream = codecs.getreader('utf-8')(text_stream)
            corpus[str(doc_id)] = parse_plain_doc_from_stream(utf8_stream)

    from_path(path)

    corpus.sync()
    return corpus
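
The slicing in get_documents assumes WikiExtractor's markup; a minimal fragment it would accept (contents invented for illustration):

# Hypothetical WikiExtractor output: each document is wrapped in a <doc> tag
# whose attributes include id="..." and title="...".
fragment = u'<doc id="12" url="http://example.org" title="Sample">\nSample\n\nBody text.\n</doc>\n'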
Example #7
import codecs
import cStringIO
import re

def parse_plain_corpus(plainpath, corpuspath):
    corpus = PyCorpus(corpuspath)
    data = codecs.open(plainpath, 'rb', 'utf-8').read()
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    data = None
    corpus.autocommit(False)
    for doc in docs:
        lines = re.split('\r?\n', doc.strip())
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()
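
The expected plain-text layout, as an invented sample: a title line, the body below it, and blank lines between documents.

# Hypothetical plainpath contents for parse_plain_corpus:
sample = u'First Title\nbody of the first document\n\nSecond Title\nbody of the second document\n'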
Example #8
    def __getitem__(self, key):
        if key in self.subkeys:
            return PyCorpus.__getitem__(self, key)
        raise KeyError(key)
Example #9
    def __init__(self, *args, **kwargs):
        self.subkeys = []
        if 'keys' in kwargs:
            self.subkeys = set(kwargs['keys'])
            del kwargs['keys']
        PyCorpus.__init__(self, *args, **kwargs)
Example #10
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    assert (orig_path != dest_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding, language=language)
    dest.commit()
    orig.close()
    dest.close()
Example #11
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Uses the default nltk tagger.'''
    assert (orig_path != eng_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(eng_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_eng_postagged_doc(orig[doc_id])
    dest.commit()
    orig.close()
    dest.close()
Example #12
def as_t3corpus(orig_path, t3_path):
    '''Convert the corpus at orig_path to a t3mesta corpus at t3_path.
       Documents already present in the destination corpus are skipped.'''
    orig_corpus = PyCorpus(orig_path)
    dest_corpus = PyCorpus(t3_path)
    dest_corpus.autocommit(False)

    dest_keys = set(dest_corpus.keys())
    for key in orig_corpus.keys():
        if key not in dest_keys:
            dest_corpus[key] = as_t3doc(orig_corpus[key])

    dest_corpus.commit()

    orig_corpus.close()
    dest_corpus.close()
Example #13
File: ner.py Project: estnltk/pfe
import os
import sys
import tempfile
from multiprocessing import Process

def crf_model_predict_mc(model_path, corpus, target_path, series_name, n):
    '''Multi-core version of crf_model_predict.
       n - number of processes to use.
    '''
    sys.stderr.write('Dividing documents between {0} processes.\n'.format(n))
    # distribute the document ids round-robin over n lists
    doc_ids  = list(corpus.keys())
    id_lists = [[] for _ in range(n)]
    idx = 0
    for doc_id in doc_ids:
        id_lists[idx].append(doc_id)
        idx += 1
        if idx >= n:
            idx = 0
    sys.stderr.write('Launching processes.\n')
    dest_names   = []
    processes    = []
    for idx, ids in enumerate(id_lists):
        if len(ids) > 0:
            folder = tempfile.mkdtemp()
            # write the new corpus
            src_name  = os.path.join(folder, 'src.corpus')
            dest_name = os.path.join(folder, 'dest.corpus')
            tmp_corp = Corpus(src_name)
            tmp_corp.autocommit(False)
            for doc_id in ids:
                tmp_corp[doc_id] = corpus[doc_id]
            tmp_corp.close()

            # start the process
            process = Process(target=crf_process,
                              args=(model_path,
                                    src_name,
                                    dest_name,
                                    series_name))
            process.start()
            sys.stderr.write('Process {0} launched\n'.format(idx))
            # store the identifiers
            dest_names.append(dest_name)
            processes.append(process)
    for p in processes:
        p.join()
    sys.stderr.write('Processes finished!\n')

    # concatenate temporary outputs
    target_corp = Corpus(target_path)
    target_corp.autocommit(False)
    for dest_name in dest_names:
        tmp_corp = Corpus(dest_name)
        for doc_id in tmp_corp:
            target_corp[doc_id] = tmp_corp[doc_id]
        tmp_corp.close()
    target_corp.close()
    sys.stderr.write('Corpus {0} created.\n'.format(target_path))
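
A hypothetical invocation (corpus and file names invented), classifying an open corpus with four worker processes:

corpus = Corpus('input.corpus')
crf_model_predict_mc('ner.model', corpus, 'output.corpus', 'ner_labels', 4)
corpus.close()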
Example #14
File: ner.py Project: estnltk/pfe
def crf_process(model_path, tmp_corpus_path, tmp_target_path, series_name):
    tmp_corp = Corpus(tmp_corpus_path)
    crf_model_predict(model_path, tmp_corp, tmp_target_path, series_name)
    tmp_corp.close()