def main():
    # Access the folder: sys.argv[0] is the name of the Python program,
    # sys.argv[1] is the directory path.
    dirpath = str(sys.argv[1])
    folder = nltk.data.find(dirpath)

    # Read the corpus files.
    corpus = TaggedCorpusReader(folder, r'.*\.prd')

    # Extract the sentences from the corpus files.
    corpusSents = corpus.sents()

    # Flatten the sentences into a single list of tokens.
    corpusElems = []
    for corpusSent in corpusSents:
        for elem in corpusSent:
            corpusElems.append(elem)

    solution = TallySolution(corpusElems)
    solution.countS()
    solution.countNP()
    solution.countVP()
    solution.countDVP()
    solution.countIVP()
def __init__(self, root, items, encoding="utf8"): gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*" sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True) TaggedCorpusReader.__init__(self, root, items, sep="_", sent_tokenizer=sent_tokenizer)
def __init__(self, root, items, encoding=None):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)
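# For context, a minimal sketch (not from either project above) of how such an
# __init__ is typically used: wrapped in a TaggedCorpusReader subclass and
# pointed at a corpus directory. The class name, corpus path, and file pattern
# here are assumptions for illustration.
from nltk.corpus.reader.tagged import TaggedCorpusReader
from nltk.tokenize import RegexpTokenizer

class UnderscoreTaggedCorpusReader(TaggedCorpusReader):
    def __init__(self, root, items, encoding='utf8'):
        # Sentence gaps follow '/.' tokens; '_CODE'/'_ID' tokens are skipped.
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

reader = UnderscoreTaggedCorpusReader('corpora/my_tagged', r'.*\.pos')  # hypothetical path/pattern
print(reader.tagged_sents()[:2])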
def __init__(self, poem_title):
    """
    >>> pel_reader = PoeticEddaLemmatizationReader("Völuspá")

    :param poem_title: title of the poem; must be one of poetic_edda_titles
    """
    assert poem_title in poetic_edda_titles
    TaggedCorpusReader.__init__(
        self,
        os.path.join(poetic_edda, poem_title, "txt_files", "lemmatization"),
        "lemmatized.txt")
def main():
    ops, args = getopt.getopt(sys.argv[1:], 'b')
    ops = dict(ops)
    if '-b' in ops:
        # Build the tagger: a trigram tagger backing off to bigram, unigram,
        # and finally a default 'N' tag.
        corpus = TaggedCorpusReader('tagged/', r'.*\.tagged')
        tagged_sents_m = corpus.tagged_sents()
        m0 = nltk.DefaultTagger('N')
        m1 = nltk.UnigramTagger(tagged_sents_m, backoff=m0)
        m2 = nltk.BigramTagger(tagged_sents_m, backoff=m1)
        m3 = nltk.TrigramTagger(tagged_sents_m, backoff=m2)
        os.makedirs(os.path.dirname(corpus_path), exist_ok=True)  # create the directory
        output_file = open(corpus_path, 'wb')
        dump(m3, output_file, -1)
        output_file.close()
    else:
        # Load the pickled tagger.
        corpus_input = open(corpus_path, 'rb')
        tagger_corpus = load(corpus_input)
        corpus_input.close()
        # Load the input file.
        file_path = sys.argv[1]
        file_input = open(file_path, 'r')
        file_lines = file_input.readlines()
        print("### LOAD DONE ###")  # debug
        triplos = []
        for i in range(len(file_lines)):
            if file_lines[i] != '\n':  # process non-empty lines
                triplos = processLine(file_lines[i], tagger_corpus, triplos)
        triplos.sort(key=sortTriplos)
        # triplos = remTriplosLastN(3, triplos)
        print(triplos)
        nodes = get_nodes(triplos)
        edgesW = triplos
        draw(nodes, edgesW)
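# For reference, a minimal sketch (not part of the original script) of reusing
# the pickled backoff chain on its own; the pickle path and sample sentence are
# assumptions.
from pickle import load

with open('tagged/tagger.pkl', 'rb') as f:  # hypothetical path
    tagger = load(f)

# The trigram tagger backs off to bigram, then unigram, then the 'N' default.
print(tagger.tag('o gato comeu o peixe'.split()))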
def __init__(self, poem_title, _type=None):
    """
    >>> pel_reader = PoeticEddaLemmatizationReader("Völuspá")

    :param poem_title: title of the poem; must be one of poetic_edda_titles
    :param _type: "tei", "test", or None for the default lemmatization file
    """
    assert poem_title in poetic_edda_titles
    if _type == "tei":
        fileid = "tei_lemmatized_complete.txt"
    elif _type == "test":
        fileid = "test_lemmatized_complete.txt"
    else:
        fileid = "lemmatized.txt"
    TaggedCorpusReader.__init__(
        self,
        os.path.join(CORPUS_PATH, poetic_edda, poem_title,
                     "txt_files", "lemmatization"),
        fileid)
import nltk
from nltk.corpus.reader.tagged import TaggedCorpusReader

from helpers.extensions import sort_dict_by_value, keys_by_value

# Lower-case the files and save them to another location:
# lower_files('docs\\brown_hw\\Train\\', 'docs\\brown_hw_lowercase\\Train\\')
# lower_files('docs\\brown_hw\\Test\\', 'docs\\brown_hw_lowercase\\Test\\')
# load_multiple_corpus_files('docs\\brown_hw\\Train\\')
# load_multiple_corpus_files('docs\\brown_hw\\Test\\')

# TaggedCorpusReader for the train set.
train_root = 'docs\\brown_hw_lowercase\\Train'
train_reader = TaggedCorpusReader(train_root, '.*')
train_words = train_reader.words()
train_word_counts = nltk.FreqDist(train_words)

# Words of the train set that occur only once.
train_word_counts_1 = keys_by_value(train_word_counts)

# Tagged words of the train set.
tagged_words_with_unk = [list(x) for x in train_reader.tagged_words()]

# Replace words whose count is one with 'UNK'.
for index, tagged_word in enumerate(tagged_words_with_unk):
    if tagged_word[0] in train_word_counts_1.keys():
        tagged_words_with_unk[index][0] = 'UNK'
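# A hypothetical continuation (not in the original): apply the same mapping to
# the test set, so that words unseen in training share the 'UNK' statistics.
# The test root mirrors the commented-out paths above and is an assumption.
test_root = 'docs\\brown_hw_lowercase\\Test'
test_reader = TaggedCorpusReader(test_root, '.*')

# Vocabulary the model keeps: every training word seen more than once.
known_words = set(train_word_counts) - set(train_word_counts_1)

# Map out-of-vocabulary test words to 'UNK' to match the training statistics.
test_words_with_unk = [w if w in known_words else 'UNK'
                       for w in test_reader.words()]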
if alreadyDone:
    continue
files = cityFileSet[city]
currentCity += 1
trans = []
wordGroupDict = dict()
print('\n' + city + '\n')
currentProgress(currentCity, totalCities, 'cities')
totalReviews = 0
for file in files:
    corpusdir = corpus_path + file[0:-5] + '/'
    if not os.path.isdir(corpusdir):
        missingCorpus(corpusdir)
    hotelcorpus = TaggedCorpusReader(corpusdir, '.*')
    stopset = getDefaultStopset(set(hotelcorpus.words("stopset.txt")))
    for review in hotelcorpus.fileids():
        if review == "stopset.txt":
            continue
        content = hotelcorpus.tagged_sents(review)
        if len(content) == 0:
            continue
        totalReviews += 1
        trimmedTokens = []
        for sentences in content:
            for word, pos in sentences:
                # Keep only alphabetic, non-stopword tokens, lower-cased.
                if word.isalpha() and word.lower() not in stopset:
                    trimmedTokens += [(word.lower(), pos)]
def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)
import nltk

# Plain-text corpus.
from nltk.corpus import PlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged-text corpus.
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Tagged words: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Plain sentences:")
for s in corpus2.sents():
    print(' '.join(s))

# Categorized corpus.
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt',
                                           cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition.
stopwords = nltk.corpus.stopwords.words('portuguese')
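# A minimal follow-up sketch (not in the original) that puts the stopword list
# to use: a frequency distribution over the content words of the tagged corpus.
content_words = [w.lower() for w in corpus2.words()
                 if w.isalpha() and w.lower() not in stopwords]
fdist = nltk.FreqDist(content_words)
print(fdist.most_common(10))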
def build_file_list(root_directory):
    '''
    Use glob to make a list of all of the .txt files in the directory of
    interest, recursively.

    :param root_directory: directory to search
    :return: list of matching file paths
    '''
    file_list = glob.glob(root_directory + "/**/*.txt", recursive=True)
    return file_list


# Use the ANC tool to build an nltk version of the data here.
oanc_directory = root + "\\corpora\\oanc\\nltk-data\\travel_guides"  # oanc/nltk-data
oanc_files = build_file_list(oanc_directory)

# See http://www.nltk.org/howto/corpus.html
oanc_corpus = TaggedCorpusReader(
    oanc_directory, oanc_files,
    sep="_")  # Specify that _ is used as the word/tag separator.
print(oanc_corpus.fileids())

x = oanc_corpus.words()[:50]
print(x)
y = oanc_corpus.paras()[:10]

"""
This script is an alternative/demo to scrapeWordHunt.py, but is not used in
this folder.
"""
SOURCE_NAME = "OANC-TRAV"
txt_file_name = "analogy_sentences_OANC-TRAV.txt"
csv_file_name = "analogy_names_OANC-TRAV.csv"
output_handler = open(root + "\\corpora\\extractions\\" + txt_file_name, "w",
def __init__(self, poem_title):
    TaggedCorpusReader.__init__(
        self,
        os.path.join(poetic_edda, poem_title, "txt_files", "syllabified"),
        "syllabified.txt")
def __init__(self, poem_title):
    assert poem_title in poetic_edda_titles
    TaggedCorpusReader.__init__(
        self,
        os.path.join(poetic_edda, poem_title, "txt_files", "pos"),
        "pos_tagged.txt")
print(str(corpus).replace('\\\\', '/'))
print(' ', repr(corpus.fileids())[:60])
print(' ', repr(corpus.words()[:10])[:60])

root = make_testcorpus(
    a="""
    This/det is/verb the/det first/adj sentence/noun ./punc
    Here/det is/verb another/adj sentence/noun ./punc
    Note/verb that/comp you/pron can/verb use/verb any/noun tag/noun set/noun

    This/det is/verb the/det second/adj paragraph/noun ./punc
    word/n without/adj a/det tag/noun :/: hello ./punc
    """,
    b="""
    This/det is/verb the/det second/adj file/noun ./punc
    """)

corpus = TaggedCorpusReader(root, list('ab'))
print(corpus.fileids())
print(str(corpus.root) == str(root))
print(corpus.words())
print(corpus.sents())  # doctest: +ELLIPSIS
print(corpus.paras())  # doctest: +ELLIPSIS
print(corpus.tagged_words())  # doctest: +ELLIPSIS
print(corpus.tagged_sents())  # doctest: +ELLIPSIS
print(corpus.tagged_paras())  # doctest: +ELLIPSIS
print(corpus.raw()[:40])
print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()])
print(len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()])
print(len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()])
print(corpus.words('a'))
print(corpus.words('b'))
# del_testcorpus(root)
import nltk
from nltk.corpus import ConllChunkCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader

# Read the POS-tagged MASC data ('_' separates word from tag) and run the
# default NLTK named-entity chunker over every sentence.
root = '/usr/local/share/nltk_data/corpora/MASC-for-NE/'
masc_for_ne = TaggedCorpusReader(root, '.*', '_')
sents = masc_for_ne.tagged_sents()
ne_sents = [nltk.ne_chunk(sent) for sent in sents]

# Gold-standard chunks for comparison, in CoNLL format.
root = "/usr/local/share/nltk_data/corpora/masc_conll/"
gold_corpus = ConllChunkCorpusReader(
    root, r".*\.conll",
    chunk_types=("DATE", "PERSON", "ORGANIZATION", "LOCATION"))
gold_sents = gold_corpus.chunked_sents()
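# A minimal sketch (not in the original) of scoring the automatic chunks
# against the gold standard with nltk's ChunkScore; it assumes the two corpora
# are sentence-aligned, which the snippet above does not guarantee.
from nltk.chunk.util import ChunkScore

chunkscore = ChunkScore()
for gold, guess in zip(gold_sents, ne_sents):
    chunkscore.score(gold, guess)

print('precision:', chunkscore.precision())
print('recall:   ', chunkscore.recall())
print('f-measure:', chunkscore.f_measure())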