Code Example #1
import sys

import nltk
from nltk.corpus.reader import TaggedCorpusReader


def main():
    # Access the corpus folder. sys.argv[0] is the name of the Python
    # program; sys.argv[1] is the directory path.
    dirpath = str(sys.argv[1])
    folder = nltk.data.find(dirpath)

    # Read the corpus files.
    corpus = TaggedCorpusReader(folder, r'.*\.prd')

    # Extract the sentences from the corpus files.
    corpusSents = corpus.sents()

    # Flatten the sentences into a single list of tokens.
    corpusElems = []
    for corpusSent in corpusSents:
        for elem in corpusSent:
            corpusElems.append(elem)

    # Tally the phrase categories (TallySolution is defined elsewhere).
    solution = TallySolution(corpusElems)
    solution.countS()
    solution.countNP()
    solution.countVP()
    solution.countDVP()
    solution.countIVP()
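The TallySolution class and its count methods live elsewhere in the project. The script takes the corpus directory as its only command-line argument; a hypothetical invocation, assuming the script is saved as tally.py and the .prd files sit under an nltk_data path:

python tally.py corpora/sample_prd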
Code Example #2
 def __init__(self, root, items, encoding="utf8"):
     # Split sentences at whitespace that follows a token ending in
     # "/.", and treat *_CODE and *_ID metadata tokens as gaps.
     gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
     sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
     TaggedCorpusReader.__init__(self,
                                 root,
                                 items,
                                 sep="_",
                                 sent_tokenizer=sent_tokenizer)
Code Example #3
 def __init__(self, root, items, encoding=None):
     gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
     sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
     TaggedCorpusReader.__init__(self,
                                 root,
                                 items,
                                 sep='_',
                                 sent_tokenizer=sent_tokenizer)
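Code Examples #2 and #3 are the same YCOE-style reader with different encoding defaults and quote styles. A minimal sketch of what the gaps regex does, run on a made-up tagged string (the sample text is an assumption, not taken from the corpus):

from nltk.tokenize import RegexpTokenizer

gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
tok = RegexpTokenizer(gaps_re, gaps=True)

# whitespace after a token ending in "/." closes a sentence;
# *_ID and *_CODE metadata tokens are treated as gaps and dropped
sample = 'He_PRO went_VBD ./. <T01234_ID She_PRO stayed_VBD ./.'
print(tok.tokenize(sample))
# ['He_PRO went_VBD ./.', 'She_PRO stayed_VBD ./.']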
Code Example #4
    def __init__(self, poem_title):
        """
        >>> pel_reader = PoeticEddaLemmatizationReader("Völuspá")

        :param poem_title: title of the Poetic Edda poem to load;
            must be one of poetic_edda_titles
        """
        assert poem_title in poetic_edda_titles
        TaggedCorpusReader.__init__(
            self,
            os.path.join(poetic_edda, poem_title, "txt_files",
                         "lemmatization"), "lemmatized.txt")
Code Example #5
File: tp2.py Project: MrBoas/PLC
import getopt
import os
import sys
from pickle import dump, load

import nltk
from nltk.corpus.reader import TaggedCorpusReader

# 'dir' and 'corpus_path' are assumed to be defined elsewhere in the script.


def main():
    ops, args = getopt.getopt(sys.argv[1:], 'b')
    ops = dict(ops)

    if '-b' in ops:
        # Train a backoff chain: trigram -> bigram -> unigram -> default 'N'.
        corpus = TaggedCorpusReader('tagged/', r'.*\.tagged')
        tagged_sents_m = corpus.tagged_sents()
        m0 = nltk.DefaultTagger('N')
        m1 = nltk.UnigramTagger(tagged_sents_m, backoff=m0)
        m2 = nltk.BigramTagger(tagged_sents_m, backoff=m1)
        m3 = nltk.TrigramTagger(tagged_sents_m, backoff=m2)

        os.makedirs(dir, exist_ok=True)  # create the directory
        output_file = open(corpus_path, 'wb')
        dump(m3, output_file, -1)
        output_file.close()
    else:
        # Load the pickled tagger.
        corpus_input = open(corpus_path, 'rb')
        tagger_corpus = load(corpus_input)
        corpus_input.close()
        # Load the input file.
        file_path = sys.argv[1]
        file_input = open(file_path, 'r')
        file_lines = file_input.readlines()
        file_input.close()
        print("### LOAD DONE ###")  # debug

        triplos = []
        for line in file_lines:
            if line != '\n':  # process non-empty lines
                triplos = processLine(line, tagger_corpus, triplos)
        triplos.sort(key=sortTriplos)
        # triplos = remTriplosLastN(3, triplos)
        print(triplos)

        nodes = get_nodes(triplos)
        edgesW = triplos
        draw(nodes, edgesW)
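Inferred usage (an assumption from the getopt call, not stated in the source): python tp2.py -b trains the backoff chain and pickles it to corpus_path, while python tp2.py input.txt loads the pickle and tags the lines of input.txt.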
Code Example #6
    def __init__(self, poem_title, _type=None):
        """
        >>> pel_reader = PoeticEddaLemmatizationReader("Völuspá")

        :param poem_title: title of the Poetic Edda poem to load
        :param _type: "tei", "test", or None for the default file
        """
        assert poem_title in poetic_edda_titles
        if _type == "tei":
            filename = "tei_lemmatized_complete.txt"
        elif _type == "test":
            filename = "test_lemmatized_complete.txt"
        else:
            filename = "lemmatized.txt"
        TaggedCorpusReader.__init__(
            self,
            os.path.join(CORPUS_PATH, poetic_edda, poem_title,
                         "txt_files", "lemmatization"),
            filename)
Code Example #7
from nltk.corpus.reader.tagged import TaggedCorpusReader
import nltk
from helpers.extensions import sort_dict_by_value, keys_by_value


# Lower-cases the files and saves them to another location
# lower_files('docs\\brown_hw\\Train\\', 'docs\\brown_hw_lowercase\\Train\\')
# lower_files('docs\\brown_hw\\Test\\', 'docs\\brown_hw_lowercase\\Test\\')

# load_multiple_corpus_files('docs\\brown_hw\\Train\\')
# load_multiple_corpus_files('docs\\brown_hw\\Test\\')


# TaggedCorpusReader for train set
train_root = 'docs\\brown_hw_lowercase\\Train'
train_reader = TaggedCorpusReader(train_root, '.*')

train_words = train_reader.words()

train_word_counts = nltk.FreqDist(train_words)

# Words of the train set that occur only once
train_word_counts_1 = keys_by_value(train_word_counts)

# Tagged words of the train set
tagged_words_with_unk = [list(x) for x in train_reader.tagged_words()]

# Replace words whose count is one with 'UNK'
for index, tagged_word in enumerate(tagged_words_with_unk):
    if tagged_word[0] in train_word_counts_1:
        tagged_words_with_unk[index][0] = 'UNK'
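keys_by_value is imported from the project's helpers.extensions module and is not shown. A plausible sketch, assuming it returns the keys whose count equals a target value (here 1, i.e. hapax words); an illustration, not the project's actual implementation:

def keys_by_value(d, value=1):
    # hypothetical helper: keys of d whose value equals `value`
    return {k for k, v in d.items() if v == value}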
Code Example #8
 if alreadyDone:
     continue
 files = cityFileSet[city]
 currentCity += 1
 trans = []
 wordGroupDict = dict()
 print('\n' + city + '\n')

 currentProgress(currentCity, totalCities, 'cities')
 totalReviews = 0

 for file in files:
     corpusdir = corpus_path + file[0:-5] + '/'
     if not os.path.isdir(corpusdir):
         missingCorpus(corpusdir)
     hotelcorpus = TaggedCorpusReader(corpusdir, '.*')

     stopset = getDefaultStopset(set(hotelcorpus.words("stopset.txt")))

     for review in hotelcorpus.fileids():
         if review == "stopset.txt":
             continue
         content = hotelcorpus.tagged_sents(review)
         if len(content) == 0:
             continue
         totalReviews += 1
         # Keep alphabetic, non-stopword tokens, lower-cased.
         trimmedTokens = []
         for sentences in content:
             for word, pos in sentences:
                 if word.isalpha() and word.lower() not in stopset:
                     trimmedTokens += [(word.lower(), pos)]
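getDefaultStopset is another project helper that is not shown. A plausible sketch, assuming it merges NLTK's English stopword list with the per-hotel words read from stopset.txt:

from nltk.corpus import stopwords

def getDefaultStopset(extra_words):
    # hypothetical helper: NLTK English stopwords plus the
    # corpus-specific words passed in
    return set(stopwords.words('english')) | set(extra_words)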
Code Example #9
File: ycoe.py Project: 52nlp/Text-Summarization
 def __init__(self, root, items, encoding='utf8'):
     gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
     sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
     TaggedCorpusReader.__init__(self, root, items, sep='_',
                                 sent_tokenizer=sent_tokenizer)
Code Example #10
import nltk

# Plain-text corpus
from nltk.corpus import PlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged-text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Tagged words: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Plain sentences:")
for s in corpus2.sents():
    print(' '.join(s))

# Categorized plain-text corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition
stopwords = nltk.corpus.stopwords.words('portuguese')
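The Portuguese stopword list is loaded but not used in this excerpt; a minimal continuation (an assumed next step, not in the original) filters stopwords out of the plain-text corpus:

content_words = [w for w in corpus1.words()
                 if w.isalpha() and w.lower() not in stopwords]
print(content_words[:20])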
Code Example #11
File: oanc_process.py Project: dbarbella/analogy
def build_file_list(root_directory):
    '''
    Use glob to make a list of all of the .txt files in the directory of interest, recursively.
    :param root_directory: directory to search recursively
    :return: list of matching file paths
    '''
    file_list = glob.glob(root_directory + "/**/*.txt", recursive=True)
    return file_list


# Use the ANC tool to build an nltk version of the data here.
oanc_directory = root + "\\corpora\\oanc\\nltk-data\\travel_guides"  # oanc/nltk-data
oanc_files = build_file_list(oanc_directory)

# See http://www.nltk.org/howto/corpus.html
oanc_corpus = TaggedCorpusReader(
    oanc_directory, oanc_files,
    sep="_")  # Specify that _ is used as a separator.
print(oanc_corpus.fileids())
x = oanc_corpus.words()[:50]
print(x)
y = oanc_corpus.paras()[:10]
"""
This script is an alternative/demo to scrapeWordHunt.py, but is not
used in this folder.
"""
SOURCE_NAME = "OANC-TRAV"

txt_file_name = "analogy_sentences_OANC-TRAV.txt"
csv_file_name = "analogy_names_OANC-TRAV.csv"
output_handler = open(root + "\\corpora\\extractions\\" + txt_file_name,
                      "w",
Code Example #12
 def __init__(self, poem_title):
     TaggedCorpusReader.__init__(
         self,
         os.path.join(poetic_edda, poem_title, "txt_files", "syllabified"),
         "syllabified.txt")
Code Example #13
 def __init__(self, poem_title):
     assert poem_title in poetic_edda_titles
     TaggedCorpusReader.__init__(
         self, os.path.join(poetic_edda, poem_title, "txt_files", "pos"),
         "pos_tagged.txt")
Code Example #14
    print(str(corpus).replace('\\\\', '/'))
    print('  ', repr(corpus.fileids())[:60])
    print('  ', repr(corpus.words()[:10])[:60])
root = make_testcorpus(a="""
    This/det is/verb the/det first/adj sentence/noun ./punc
    Here/det  is/verb  another/adj    sentence/noun ./punc
    Note/verb that/comp you/pron can/verb use/verb
    any/noun tag/noun set/noun

    This/det is/verb the/det second/adj paragraph/noun ./punc
    word/n without/adj a/det tag/noun :/: hello ./punc
    """,
                       b="""
    This/det is/verb the/det second/adj file/noun ./punc
    """)
corpus = TaggedCorpusReader(root, list('ab'))
print(corpus.fileids())
print(str(corpus.root) == str(root))
print(corpus.words())
print(corpus.sents())  # doctest: +ELLIPSIS
print(corpus.paras())  # doctest: +ELLIPSIS
print(corpus.tagged_words())  # doctest: +ELLIPSIS
print(corpus.tagged_sents())  # doctest: +ELLIPSIS
print(corpus.tagged_paras())  # doctest: +ELLIPSIS
print(corpus.raw()[:40])
print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()])
print(len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()])
print(len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()])
print(corpus.words('a'))
print(corpus.words('b'))
# del_testcorpus(root)
Code Example #15
File: init_masc_ne.py Project: JakeBrawer/misc
import nltk
from nltk.corpus import ConllChunkCorpusReader

from nltk.corpus.reader.tagged import TaggedCorpusReader
root = '/usr/local/share/nltk_data/corpora/MASC-for-NE/'
masc_for_ne = TaggedCorpusReader(root, '.*', '_')  # third argument is the word/tag separator

sents = masc_for_ne.tagged_sents()
ne_sents = [nltk.ne_chunk(sent) for sent in sents]

root = "/usr/local/share/nltk_data/corpora/masc_conll/"
gold_corpus = ConllChunkCorpusReader(root, r".*\.conll", chunk_types=("DATE", "PERSON", "ORGANIZATION", "LOCATION"))
gold_sents = gold_corpus.chunked_sents()
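With both readers in place, the NE chunker's output can be scored against the gold chunks. A sketch, not part of the original script, assuming the two corpora list the same sentences in the same order:

from nltk.chunk.util import ChunkScore

chunkscore = ChunkScore()
for gold, guess in zip(gold_sents, ne_sents):
    chunkscore.score(gold, guess)
print(chunkscore)  # precision, recall and F-measure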