コード例 #1
0
ファイル: ch02.py プロジェクト: 447327642/nltk-examples
def describe(corpus):
  """Print per-file corpus statistics, one tab-separated row per fileid.

  Columns: average chars per word (c/w), words per sentence (w/s),
  words per vocabulary item i.e. lexical diversity (w/v), and the fileid.

  Note: ratios use integer (floor) division, matching the original
  Python 2 behaviour of ``/`` on ints.
  """
  print("\t".join(["c/w", "w/s", "w/v", "id"]))
  for fileid in corpus.fileids():
    nchars = len(corpus.raw(fileid))
    nwords = len(corpus.words(fileid))
    nsents = len(corpus.sents(fileid))
    # Vocabulary size: distinct case-folded word types.
    nvocab = len({w.lower() for w in corpus.words(fileid)})
    print("\t".join([str(nchars // nwords), str(nwords // nsents),
      str(nwords // nvocab), fileid]))
コード例 #2
0
ファイル: ch02.py プロジェクト: zeuscaesar/nltk-examples
def describe(corpus):
    """Print per-file corpus statistics, one tab-separated row per fileid.

    Columns: average chars per word (c/w), words per sentence (w/s),
    words per vocabulary item i.e. lexical diversity (w/v), and the fileid.

    Note: ratios use integer (floor) division, matching the original
    Python 2 behaviour of ``/`` on ints.
    """
    print("\t".join(["c/w", "w/s", "w/v", "id"]))
    for fileid in corpus.fileids():
        nchars = len(corpus.raw(fileid))
        nwords = len(corpus.words(fileid))
        nsents = len(corpus.sents(fileid))
        # Vocabulary size: distinct case-folded word types.
        nvocab = len({w.lower() for w in corpus.words(fileid)})
        print("\t".join([
            str(nchars // nwords),
            str(nwords // nsents),
            str(nwords // nvocab), fileid
        ]))
コード例 #3
0
def create_dfs(corpus):
    """Build a raw-text dataframe from *corpus* and its tf-idf counterpart.

    Returns a ``(tf_idf_df, df)`` pair, where ``df`` has Year, Last_name
    and preprocessed Text columns, one row per corpus file.
    """
    print("Gathering data..")
    rows = []
    for fileid in corpus.fileids():
        # File names look like "<year>-<lastname>.<ext>"; normalising the
        # dash to a dot lets a single split recover both fields.
        year, last_name, _ = fileid.replace('-', '.').split('.')
        rows.append({
            'Year': year,
            'Last_name': last_name,
            'Text': pre_process(corpus.raw(fileid)),  # Preprocessed text file
        })
    print("Creating dataframe..")
    df = pd.DataFrame(rows)
    df['Year'] = df['Year'].astype(int)

    return get_tfidf(df), df
コード例 #4
0
ファイル: autotext.py プロジェクト: jorgegus/autotext
 def _read_corpus(self, corpus, path):
     #Lists for sklearn
     documents = []
     targets = []
     j = 0
     #print('Reading files')
     try:
         for cat in corpus.categories():
             for doc in corpus.fileids(cat):
                 documents.append(corpus.raw(doc))
                 targets.append(j)
             j += 1
     except:
         j = 0
         for cat in corpus.categories():
             for doc in corpus.fileids(cat):
                 raw_document = open(path + doc, errors='ignore')
                 documents.append(raw_document.read())
                 targets.append(j)
                 raw_document.close()
             j += 1
     return documents, targets
コード例 #5
0
# Corpus name and file limit come from the command line, with defaults.
corpusname = sys.argv[1] if len(sys.argv) >= 2 else "inaugural"

filelim = int(sys.argv[2]) if len(sys.argv) >= 3 else 4

# Resolve the named corpus object from nltk.corpus by attribute lookup.
corpus = getattr(nltk.corpus, corpusname)


def mkdir_p(path):
    """Create directory *path* (and parents), like the shell's ``mkdir -p``.

    Succeeds silently if the directory already exists; still raises if
    *path* exists but is not a directory, or creation fails otherwise.
    """
    # exist_ok=True is race-free: no check-then-create gap, unlike the
    # manual EEXIST/isdir dance it replaces.
    os.makedirs(path, exist_ok=True)


path = "./%s" % corpusname
mkdir_p(path)

for index in range(filelim):
    fid = corpus.fileids()[index]
    with open("%s/%s" % (path, fid), 'w') as out:
        # Newlines are flattened to spaces so MR interprets each file
        # as a single input record.
        out.write(corpus.raw(fid).replace('\n', ' '))
コード例 #6
0
# Corpus name and file limit come from the command line, with defaults.
corpusname = sys.argv[1] if len(sys.argv) >= 2 else "inaugural"

filelim = int(sys.argv[2]) if len(sys.argv) >= 3 else 4

# Resolve the named corpus object from nltk.corpus by attribute lookup.
corpus = getattr(nltk.corpus, corpusname)


def mkdir_p(path):
    """Create directory *path* (and parents), like the shell's ``mkdir -p``.

    Succeeds silently if the directory already exists; still raises if
    *path* exists but is not a directory, or creation fails otherwise.
    """
    # exist_ok=True is race-free: no check-then-create gap, unlike the
    # manual EEXIST/isdir dance it replaces.
    os.makedirs(path, exist_ok=True)

path = "./%s" % corpusname
mkdir_p(path)


for index in range(filelim):
    fid = corpus.fileids()[index]
    with open("%s/%s" % (path, fid), 'w') as out:
        # Newlines are flattened to spaces so MR interprets each file
        # as a single input record.
        out.write(corpus.raw(fid).replace('\n', ' '))
コード例 #7
0
# Build a throwaway two-file plaintext corpus and exercise the reader API.
root = make_testcorpus(ext='.txt',
                       a="""
                       This is the first sentence.  Here is another
                       sentence!  And here's a third sentence.

                       This is the second paragraph.  Tokenization is currently
                       fairly simple, so the period in Mr. gets tokenized.
                       """,
                       b="""This is the second file.""")
# An explicit fileid list and a matching regex should yield the same corpus.
corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt'])
print(corpus.fileids())
# Raw string: '.*\.txt' is an invalid escape sequence on modern Python.
corpus = PlaintextCorpusReader(root, r'.*\.txt')
print(corpus.fileids())
print(str(corpus.root) == str(root))
print(corpus.words())
print(corpus.raw()[:40])
print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()])
print(corpus.words('a.txt'))
print(corpus.words('b.txt'))
print(corpus.words()[:4], corpus.words()[-4:])
# del_testcorpus(root)
# Smoke-test a few of the bundled corpora.
for corpus in (abc, genesis, inaugural, state_union, webtext):
    print(str(corpus).replace('\\\\', '/'))
    print('  ', repr(corpus.fileids())[:60])
    print('  ', repr(corpus.words()[:10])[:60])
root = make_testcorpus(a="""
    This/det is/verb the/det first/adj sentence/noun ./punc
    Here/det  is/verb  another/adj    sentence/noun ./punc
    Note/verb that/comp you/pron can/verb use/verb
    any/noun tag/noun set/noun