Example No. 1
def main():
    corpus_root = sys.argv[1]
    num_text_files = int(sys.argv[2])
    algorithm_type = sys.argv[3]
    pmi_freq_filter = int(sys.argv[4])
    file_list = []
    for i in range(0, num_text_files):
        file_list.append(sys.argv[5 + i])
    corpus = PlaintextCorpusReader(corpus_root, '.*')
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams((f(algorithm_type)))
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams((f(algorithm_type)))

    sort = (sorted(scored, key=lambda tu: tu[1]))
    for key in sort:
        ngrams = len(key[0])
        if (ngrams == 2):
            print(key[0][0] + "\t" + key[0][1] + "\t" + str(key[1]))
        else:
            print(key[0][0] + "\t" + key[0][1] + "\t" + key[0][2] + "\t" +
                  str(key[1]))
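The helper `f` used above is not defined in this snippet; a minimal sketch of what it might look like, assuming the algorithm name follows a scheme like 'bigram_pmi' or 'trigram_likelihood_ratio' (both the naming scheme and the helper itself are assumptions, not the original code):

import nltk

def f(algorithm_type):
    # Hypothetical helper: pick the association-measure class from the name
    # and return the matching scoring function for score_ngrams().
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
    name = algorithm_type.replace('bigram_', '').replace('trigram_', '')
    return getattr(measures, name)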
Example No. 2
def corpus_reader(corpus_name):
    ''' Open a PlaintextCorpusReader for the given UDN corpus.
    '''
    # If the user requested an unfiltered corpus version, we need to know the root corpus name
    root_corpus = corpus_name.replace('-unfiltered', '')

    # Ensure the desired corpus's submodule is checked out
    if not os.path.exists('./corpora/{}/README.md'.format(root_corpus)):
        retcode = subprocess.call(
            "git submodule update --init -- corpora/{}".format(
                root_corpus).split(" "))
        if retcode != 0:
            print(
                "Failed to check out the submodule for corpus '{}'. Try running 'git submodule update --init' manually."
                .format(root_corpus))
            exit()

    percentage = ''
    with open('./corpora/{0}/{0}.txt'.format(root_corpus), 'r') as f:
        manifest = f.readlines()
        query = manifest[0].split(" ")[3]
        num_found = util.dry_make_request(query, 0, 1)[0]['numFound']
        num_in_corpus, last_one = util.files_in_dir('./corpora/{}/{}'.format(
            root_corpus, corpus_name))
        percentage = '{0:.0%}'.format(num_in_corpus / num_found)
        if percentage != '100%':
            print('NOTE: This corpus is only {} complete. Last file: {}\n'.
                  format(percentage, last_one))

    corpus = PlaintextCorpusReader(
        './corpora/{}/{}'.format(root_corpus, corpus_name), r'.*\.txt')
    return corpus
Example No. 3
def get_phrase():
    root_dir = r'E:\github_repo\python_basic\pythonbasictest\self_nltk\files'
    wordlists = PlaintextCorpusReader(root_dir,".*")
    x = nltk.Text(wordlists.words("test.txt"))
    print(x)
    
    x.collocations()  # Text.collocations() prints its results and returns None
Example No. 4
def construct_models():
    """ Builds the classification models. """
    sources = [
        'Conservative',  # Scalia + Rehnquist
        'Progressive'
    ]  # Ginsburg + Stevens
    corpus = [(PlaintextCorpusReader('data/' + path + '/', '.*'), path)
              for path in sources]
    documents = []
    for (c, cat) in corpus:
        for fileid in c.fileids():
            documents.append((c.words(fileid), cat))

    random.shuffle(documents)

    all_words = []

    for (c, cat) in corpus:
        all_words.extend(c.words())

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:3000]
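    # Note: in NLTK 3, FreqDist.keys() is not ordered by frequency, so this
    # slice is not the 3000 most frequent words; [w for w, _ in
    # all_words.most_common(3000)] would give those.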
    featuresets = [(find_features(opinion, word_features), cat)
                   for (opinion, cat) in documents]

    training_subset = int(len(featuresets) * 0.9)
    training_set = featuresets[:training_subset]
    testing_set = featuresets[training_subset:]

    ensemble = EnsembleClassifer(training_set, testing_set)
    ensemble.show_most_useful_features()
    ensemble.accuracy()
    print(ensemble.classify(testing_set[0][0]))
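`find_features` and `EnsembleClassifer` are defined elsewhere in the original project; a minimal sketch of `find_features`, consistent with how it is called above (an assumption, not the original implementation):

def find_features(document, word_features):
    # Hypothetical helper: mark which of the selected feature words occur in
    # the document's word list.
    words = set(document)
    return {word: (word in words) for word in word_features}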
Example No. 5
def create_corpus(directory):
    corpus = PlaintextCorpusReader(directory,
                                   '.*',
                                   encoding="iso-8859-1",
                                   word_tokenizer=word_tokenize,
                                   sent_tokenizer=sent_tokenize)
    return corpus
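One caveat on the snippet above: PlaintextCorpusReader calls .tokenize() on the objects passed as word_tokenizer and sent_tokenizer, so tokenizer instances (rather than the word_tokenize / sent_tokenize functions) are usually supplied. A variant sketch under that assumption:

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer


def create_corpus_with_tokenizer_objects(directory):
    # Variant sketch (not the original function): pass tokenizer objects that
    # expose the .tokenize() method the reader expects.
    return PlaintextCorpusReader(directory,
                                 '.*',
                                 encoding="iso-8859-1",
                                 word_tokenizer=TreebankWordTokenizer(),
                                 sent_tokenizer=PunktSentenceTokenizer())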
Example No. 6
class App:

    def makeTrainingData (reader):
        for category in reader.categories():
            for file in reader.fileids(category):
                yield FreqDist(reader.words(fileids=[file])), category



    corpusDirectory = "../../resources/input/"
    #Was using PlaintextCorpusReader, switched to Categorized to provide categories
    wattsCorpus = PlaintextCorpusReader(corpusDirectory, '.*')

    print(wattsCorpus.raw().strip())
    print(wattsCorpus.words())
    for sentence in wattsCorpus.sents():
        print(sentence)
    print(len(wattsCorpus.sents()))
    text = nltk.tokenize.word_tokenize(wattsCorpus.raw())
    print("tokenized text: ", text)

    #example of finding similar word
    text = nltk.Text(word.lower() for word in wattsCorpus.words())
    print "similar to god: ", text.similar('god')

    words = nltk.pos_tag(text)
    fdist = nltk.FreqDist(words)
    print "frequencey distribution: ", fdist

    sentence = "So there are two ways of playing the game. The first way, which is the usual way, is that a guru or teacher who wants "
    sentenceWords = nltk.word_tokenize(sentence)
    fdistForSentence = nltk.FreqDist(sentenceWords)
    fdistForSentence.plot()
Example No. 7
def load_corpus(race_code=None,
                gender_code=None
                ):  #loads corpora into an array based on race and gender

    if (race_code == None):  # if none is specified, search all
        race_code = ".."
    if (gender_code == None):
        gender_code = ".."

    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code +
        "\.txt")  # uses filename encoding to load specified texts
    corpora = []

    for fileid in reader.fileids(
    ):  #creates ComedyCorpus object, populates with fileid and name
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(
                reader.raw(fileid))  #gets word content based on fileid
        except UnicodeDecodeError:
            continue
        fileid = re.sub("_" + race_code + "-" + gender_code + "\.txt", "",
                        fileid)
        #name is fileid without encoding
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)

    return corpora
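`ComedyCorpus` comes from elsewhere in the original project; a minimal stand-in inferred only from the calls made in load_corpus (an assumption):

class ComedyCorpus:
    # Hypothetical stand-in: just enough attributes to satisfy load_corpus().
    def set_fileid(self, fileid):
        self.fileid = fileid

    def set_text(self, text):
        self.text = text

    def set_name(self, name):
        self.name = name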
Example No. 8
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False,
                            estimator)  #uses bigrams just cause they BETTER
    return ngrammodel
Example No. 9
def getCorupsFromCorpusFile(CorpusFile):

    CorpusDir, CorpusFile = os.path.split(CorpusFile)

    corpus = PlaintextCorpusReader(CorpusDir, CorpusFile)

    return corpus
Example No. 10
def create_corpus():
    ## Create corpus from abstract
    ## fetched by BIBOT
    ## return a corpus object

    ## Read the abstract result file
    abstract_to_content = {}
    abstract_file = open("fetched/pubmed_abstract.txt", "r")
    for line in abstract_file:
        line = line.replace("\n", "")
        if (line[0] == ">"):
            abstract = line[1:]
            abstract_to_content[abstract] = ""
        else:
            content = line
            abstract_to_content[abstract] = content
    abstract_file.close()

    ## create files
    for key in abstract_to_content.keys():
        text_file = open("fetched/corpus/" + str(key) + ".txt", "w")
        text_file.write(abstract_to_content[key])
        text_file.close()

    ## nltk magical lines
    corpusdir = 'fetched/corpus/'
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')

    return newcorpus
Example No. 11
def generate_words_grammar():
    """
    Use sentence grammar to find words that could be Rent lyrics
    :return:
    """
    # Load corpuses to look in
    gentrification = PlaintextCorpusReader(
        'corpus', '.*')  # Gentrification articles are in this directory
    gentrify_sents = gentrification.sents()
    wine_sents = nltk.corpus.webtext.sents('wine.txt')
    corpus_sents = gentrify_sents + wine_sents
    syls_1 = []
    syls_2 = []
    syls_4 = []
    syls_2_sing = []
    for sent in corpus_sents:
        parsed_sent = nltk.pos_tag(sent)
        for word in parsed_sent:
            no_syls = count_syllables(word[0])
            if word[1] == 'NNS' and len(word[0]) > 3:
                if no_syls == 1:
                    syls_1 = syls_1 + [word[0].lower()]
                elif no_syls == 2:
                    syls_2 = syls_2 + [word[0].lower()]
                elif no_syls == 4:
                    syls_4 = syls_4 + [word[0].lower()]
            if word[1] == 'NN' and len(word[0]) > 2:
                if no_syls == 2:
                    syls_2_sing = syls_2_sing + [word[0].lower()]
    return list(set(syls_1)), list(set(syls_2)), list(set(syls_4)), list(
        set(syls_2_sing))
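`count_syllables` is not included in this snippet; a rough vowel-group heuristic, offered as an assumption rather than the original implementation:

import re

def count_syllables(word):
    # Hypothetical helper: approximate syllables as contiguous vowel groups,
    # with a minimum of one.
    return max(1, len(re.findall(r'[aeiouy]+', word.lower())))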
Example No. 12
def read_corpus(corpus_path):
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_path, ".*\.txt")
    ctext = corpus.raw()
    #    with open('corpus.txt', 'w') as cf:
    #        cf.write(ctext.encode('utf-8'))
    return ctext
Example No. 13
def load_feat_data(dir_array):

    data_list = []

    for direct in dir_array:

        data = []

        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                for payload in e.get_payload():
                    text = payload.get_payload()

            else:
                text = e.get_payload()

            data.append(extract_features(text, corpus, file))

        data_list.extend(data)

    return data_list
Example No. 14
    def __init__(self, master):
        ''' Constructor. master is a string that names a directory in the same
        repository that contains all the work from inspiration.
        '''
        self.master = 'masters/' + master
        self.reader = PlaintextCorpusReader(self.master,
                                            r'.*',
                                            encoding='utf-8')
        self.text = self.reader.words()
Example No. 15
 def cv_to_matrix(self):
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir,'.*',encoding='windows-1252')
     print("Preprocessing words....")
     sents = [[token.lemma_ for token in nlp(" ".join(self.clean(sent)).lower()) if token.lemma_ not in stopset] for sent in corpa.sents()]
     print("training word vectors....")
     model = Word2Vec(sents,window=5, size=self.ncol,min_count=1, workers=4)
     fname = get_tmpfile("vectors.kv")
     model.wv.save(fname)
     print("cv_to_matrix model saved")
     return model.wv
Example No. 16
 def build_d2v_model(self):
     print("Début de la construction du modèle Doc2Vec")
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir, '.*',encoding='windows-1252')
     print("tokenizing...")
     resumes = [[token.lemma_  for sent in paras for token in nlp(" ".join(self.clean(sent)).lower()) if token.lemma_ not in stopset] for paras in  corpa.paras()]
     #print(resumes[0:3])
     print("tokenization completed")
     documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes)]
     model = Doc2Vec(documents, vector_size=self.cv_length, window=5, min_count=1, workers=4)
     print("Fin de la construction du modèle Doc2Vec")
     return model
Example No. 17
 def __init__(self, data_root):
     self.data_root = data_root
     self.data = PlaintextCorpusReader(data_root, '.*')
     self.words = [i for i in self.data.words() if i.isalpha()]
     self.text = Text(self.words)
     self.stop = set(stopwords.words('english')).union({
         'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
         'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
         'march', 'boston', 'table'
     })
     with open('bib.json') as fi:
         self.bib = json.load(fi)
Example No. 18
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir:
        The relative path to the source directory that contains all the data (book) files
    :return {str} fileid_lst:
        List of all file ids ending in '.txt' in the source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
    fileid_lst = temp_corp.fileids()

    return fileid_lst
Example No. 19
 def token_in_coverage(self):
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir, '.*',encoding='windows-1252')
     resumes = [[item for sent in paras for item in sent] for paras in corpa.paras()]
     cpt=0
     for resume in resumes :
         resume_text = " ".join(resume)
         resume_sents = nltk.sent_tokenize(resume_text)
         resume_words = set(token.lemma_ for sent in resume_sents for token in nlp(" ".join(sent).lower()))
         if not resume_words.isdisjoint(self.tokens_in) :
             cpt+=1
     coverage = cpt*1.0/len(resumes)
     print("token_in coverage : {}".format(coverage))
Example No. 20
    def __init__(self,
                 input_folder_name,
                 doc_pattern,
                 categ_pattern,
                 encoding='utf-8'):
        CategorizedPlaintextCorpusReader.__init__(self,
                                                  input_folder_name,
                                                  doc_pattern,
                                                  cat_pattern=categ_pattern)
        self.input_folder_name = input_folder_name
        self.encoding = encoding
        self.root_reader = PlaintextCorpusReader(input_folder_name,
                                                 fileids=r'[^\/]*.' +
                                                 doc_pattern[-3:])
        #self.root_ids =[ os.path.join(input_folder_name,item) for item in self.root_reader.fileids()]

        self.root_ids = list(self.root_reader.fileids())
Example No. 21
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
Example No. 22
def load_data(dir_label):

    data_list = []
    labels = []

    for dl in dir_label:

        data = []

        directory = dl[0]
        label = dl[1]

        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:

            d = []

            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()

            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]

            for f in feats:
                d.extend(list(f.values()))

            data.append(d)
            labels.append(label)

        data_list.extend(data)

    return [data_list, labels]
Example No. 23
def pdf_to_corpus():
    path = 'D://Eclipse Workspace//NLP//Assignment//res//'

    for filename in glob.glob(os.path.join(path, '*.pdf')):
        print(filename)
        pdfFileObj = open(filename, 'rb')

        # creating a pdf reader object
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # printing number of pages in pdf file
        print(pdfReader.numPages)

        # creating a page object
        pageObj = pdfReader.getPage(0)

        # extracting text from page
        text = pageObj.extractText()

        strings_list = text.split("\n")
        # Make new dir for the corpus.
        corpusdir = 'customcorpus/'
        if not os.path.isdir(corpusdir):
            os.mkdir(corpusdir)

        # Output the files into the directory.
        file_name = filename.split("\\")[-1]

        print(file_name)
        pbar = ProgressBar(widgets=[
            'Creating Corpus',
            Bar('#', '[', ']'), ' ',
            Percentage(), ' ',
            ETA()
        ],
                           maxval=100)
        for text in pbar(strings_list):
            with open(corpusdir + '[PDF] ' + file_name + '.txt', 'ab') as fout:
                fout.write(text.encode('utf-8'))
        pbar.finish()

        #create_corpus(text)
        corpus = PlaintextCorpusReader('customcorpus/', '.*')

        print(corpus.raw())
Example No. 24
def token_assamese():
    # Modify these to change the location of the corpus file and the name of the corpus file
    corpus_path = "/Users/partha/All/Python/ProjectMaterials/Learned material/Arts"
    corpus_filename = 'Psychology.txt'

    newcorpus = PlaintextCorpusReader(corpus_path,
                                      corpus_filename,
                                      encoding='utf16')
    text = newcorpus.raw().strip().replace('।', '.')
    words = nltk.word_tokenize(text)

    for index, item in enumerate(words):
        if (str(item) == '.'):
            words[index] = '।'

    output_file_path = "C:/Users/HEMANT/Documents/1.Project/"
    output_filename = 'Result.txt'

    with open(output_file_path + output_filename, 'w', encoding='utf8') as f:
        for i in words:
            f.write(str(i) + '\n')
Example No. 25
def Read_corpus(path_c, fname_c, fo1):
    import nltk
    import re
    import spacy
    import en_core_web_sm
    import fileinput
    nlp = spacy.load('en_core_web_sm')
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    pcorpus = PlaintextCorpusReader(path_c, fname_c, encoding="utf")

    #HTML Tags to file
    fappend(fo1, P_htmltag.writehtmltag1(fname_c), fname_c)

    # Iterate through each paragraph
    for para in pcorpus.paras():
        L0 = rep_tags(para)
        L1 = L0.split("\n")
        for i, w in enumerate(L1):
            if (w != ""):
                ApplyNLP(nlp(str(w[1:])), fo1)

    fappend(fo1, P_htmltag.writehtmltag3(fname_c), fname_c)
Example No. 26
def read_article(file_path):
    #file = open(file_path, "r")
    ##INSERT FILE NAME IN FUNCTION CALL BELOW######
    bcr = PlaintextCorpusReader(file_path, 'bernie.txt')
    #filedata = file.read()
    filedata = bcr.raw()
    #for word in filedata.split():
    #    if word == 'Mr.':
    #        filedata[word] = 'Mr'
    article = filedata.replace("\n\n", '. ').replace('Mr.', 'Mr').replace(
        "\r", ' ').replace('\n', ' ').split('. ')
    articlez = []
    for line in article:
        if line == '':
            continue
        if line[0] == '\n':
            line = line[1:]
        articlez.append(line)
    sentences = []
    for sentence in articlez:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()

    return sentences
Example No. 27

# new file with weightings
new_file = open(new_file_name, "w+", encoding="utf-8")
more_stopwords = open("stopwords.txt", "r", encoding="utf-8")
stop_words = set(nltk.corpus.stopwords.words('english'))
for line in more_stopwords:
    stop_words.add(line[:-1])
    #words = line.split()
    #for word in words:
        #stop_words.add(word)
regex = re.compile(r'(?:^|)[a-zA-Z0-9\-]+')
not_regex = re.compile(r'\@[a-zA-Z0-9\-]+')
#print(stop_words)

texts = PlaintextCorpusReader(CORPUS_TEXT, '.*\.txt')

def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    #pdb.set_trace()
    #print(stop_words)
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    lambda_func = lambda w_p_c: w_p_c[2] != 'O'
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda_func) if key]
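    # The snippet appears to be truncated here; a likely completion (an
    # assumption): keep only candidates that are neither stop words nor made
    # up entirely of punctuation, as the comment above describes.
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]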
Example No. 28
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from wordcloud import STOPWORDS
_stop_words = set(STOPWORDS)

stop_words = set(stopwords.words('english'))
stop_words.update(_stop_words, ('thing', 'u', 'us', 'nt'))
lemmatizer = WordNetLemmatizer()

# Read .txt files from ./docs directory into a corpus
corpus = PlaintextCorpusReader('./docs/', ".*\.txt")

# filter the list of words to remove unneeded ones and punctuation
# (this loses "U.S.", which is not ideal; splitting sentences on spaces and preserving the dots just for it was tried)

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenized = tokenizer.tokenize(corpus.raw())

# drop punctuation
non_punct = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct,
           tokenized))

# lowercase everything
lowercased = [word.lower() for word in non_punct]
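The stop_words set and lemmatizer defined above are not used in the lines shown; a plausible continuation (an assumption, not part of the source):

# Assumed continuation: drop stop words, then lemmatize what remains.
filtered = [word for word in lowercased if word not in stop_words]
lemmatized = [lemmatizer.lemmatize(word) for word in filtered]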
Example No. 29
def detect(request):
    # Input data
    if request.method == 'POST':
        identificacion=request.POST.get('dni')
        a=request.FILES['document']
        documento=str(a)
        datos_doc=documento.split('.')
        nombre_doc=datos_doc[0]
        tipo_doc=datos_doc[1]
        if tipo_doc=='txt':
            name=request.FILES['document'].read().lower()
            print(datos_doc)
            #mul=set(stopwords.words("spanish"))
            mul=codecs.open('mul.txt', "r", encoding='UTF-8').read()
            remove('muletillas.txt')
            discurso=(name.decode('UTF-8'))
            # Separate filler words ("muletillas") from common words
            text_completo = wordpunct_tokenize(discurso)
            m = []
            m = [w for w in text_completo if w in mul]
            
            muletillas= codecs.open('muletillas.txt', "a")
            for i in m:
                muletillas.write(i)
                muletillas.write(" ")
                
            muletillas.close()
            # Count filler words
            tokenizador=RegexpTokenizer('\w+|[^\w\s]+')

            corpus = PlaintextCorpusReader(".", 'muletillas.txt',word_tokenizer=tokenizador, encoding='Latin-1')
            
            frecuencia=FreqDist(corpus.words())
            salida=codecs.open("muletillasR.txt","w",encoding="utf-8")
            palabras=[]
            repeticiones=[]
            # Write the extracted data to a txt file for later presentation
            for mc in frecuencia.most_common(): 
                palabra=mc[0]
                frecuencia_absoluta=mc[1]
                frecuencia_relativa=frecuencia.freq(palabra)
                cadena=str(frecuencia_absoluta)+"\t"+str(frecuencia_relativa)+"\t"+palabra  
                
                palabras.append(palabra.upper()) 
                repeticiones.append(frecuencia_absoluta)  
                salida.write(cadena+"\n")
            try:
                collection.insert_one({
                    'identificacion':identificacion,
                    'documento': documento,
                    'discurso':discurso,
                    'muletillas':palabras
                })
            except Exception as e:
                print("Error : ", type(e), e)
            # Send the data to the front end
            context={
                'documento': nombre_doc,
                'muletillas':palabras[0:10],
                'repeticiones': repeticiones[0:10]
            }
            return render(request, 'responde.html', context)
        else :
            messages.warning(request, "Verifique el tipo de archivo", extra_tags='file')
            return render(request, 'home.html')
    return render(request, 'home.html')





# class LineChartJSONView(BaseLineChartView):
#     def get_labels():
#         """Return 7 labels for the x-axis."""
#         return ["January", "February", "March", "April", "May", "June","July", "August", "September", "October"]

#     def get_providers(self):
#         """Return names of datasets."""
#         return ["Repeticiones"]

#     def get_data(self):
#         """Return 3 datasets to plot."""

#         return [[75, 44, 92, 11, 44, 95, 35, 11, 44, 95, 35]]


# line_chart = TemplateView.as_view(template_name='responde.html')
# line_chart_json = LineChartJSONView.as_view()
Example No. 30
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
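# Note: nltk.model.NgramModel is only available in older NLTK releases (2.x);
# it was removed in NLTK 3, so this snippet needs an old NLTK version to run.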
from nltk.tokenize import sent_tokenize, word_tokenize


corpusdir = 'corpora/' # Directory of corpus.
SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)

healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))