Code example #1
def plaintext_to_conll(inpath,
                       postag=False,
                       lemmatise=False,
                       lang='en',
                       metadata=False,
                       outpath=False,
                       nltk_data_path=False,
                       speaker_segmentation=False):
    """
    Take a plaintext corpus and sent/word tokenise.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """

    import nltk
    import shutil
    import pandas as pd
    from corpkit.process import saferead

    from corpkit.build import get_filepaths
    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS
    if postag:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make a DataFrame of each, convert this
    # to CoNLL and write it to a new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        else:
            if postag:
                for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df,
                          fo,
                          metadata=metadata,
                          plain=plain,
                          stripped=data,
                          speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.labels[0]))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    else:
        return inpath + '-tokenised'
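
Example #1 depends on several corpkit helpers (saferead, nested_list_to_pandas, pos_tag_series, write_df_to_conll) that are not shown here. As a minimal, self-contained sketch of just its core step (Punkt sentence spans plus NLTK word tokenisation gathered into a sentence/token indexed table), something like the following should work with stock NLTK and pandas; the helper name tokenise_to_series is illustrative and not part of corpkit:

import nltk
import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer

def tokenise_to_series(text):
    # Sentence spans come from Punkt, words from nltk.word_tokenize (both need
    # the 'punkt' resource downloaded); dict keys become a (sent, token) MultiIndex.
    stok = PunktSentenceTokenizer()
    rows = {}
    for s_idx, (start, end) in enumerate(stok.span_tokenize(text), start=1):
        for w_idx, word in enumerate(nltk.word_tokenize(text[start:end]), start=1):
            rows[(s_idx, w_idx)] = word
    return pd.Series(rows, name='w')

print(tokenise_to_series("Dr. Smith went home. He slept."))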
Code example #2
def sent_pos(in_dir):
    """ Positions of citation markers in sentences, relatve to where in doc
    """

    arxiv_base_url = 'http://export.arxiv.org/api/query?search_query=id:'
    arxiv_ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
        'arxiv': 'http://arxiv.org/schemas/atom'
    }

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]

    x_all = list(range(-5, 6))
    # one counter per relative position, for each word class of interest
    y_verb = [0] * len(x_all)
    y_noun = [0] * len(x_all)
    y_propnoun = [0] * len(x_all)
    y_prepos = [0] * len(x_all)
    y_adj = [0] * len(x_all)
    y_wh = [0] * len(x_all)
    y_adv = [0] * len(x_all)
    y_pr = [0] * len(x_all)
    y_form = [0] * len(x_all)
    y_fig = [0] * len(x_all)
    y_tab = [0] * len(x_all)
    file_names = os.listdir(in_dir)
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue

        phys_cat = [
            'hep-th', 'hep-ph', 'hep-lat', 'hep-ex', 'cond-mat', 'astro-ph',
            'physics', 'nucl', 'gr-qc', 'quant-ph', 'nlin'
        ]
        math_cat = ['math', 'math-ph']
        cs_cat = ['cs']
        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]
        resp = requests.get('{}{}&start=0&max_results=1'.format(
            arxiv_base_url, aid))
        xml_root = etree.fromstring(resp.text.encode('utf-8'))
        result_elems = xml_root.xpath('/atom:feed/atom:entry',
                                      namespaces=arxiv_ns)
        result = result_elems[0]
        cat = result.find('arxiv:primary_category',
                          namespaces=arxiv_ns).get('term')
        high_cat = None
        for pc in phys_cat:
            if pc in cat:
                high_cat = 'phys'
                break
        if not high_cat:
            for mc in math_cat:
                if mc in cat:
                    high_cat = 'math'
                    break
        if not high_cat:
            if 'cs' in cat:
                high_cat = 'cs'
        if not high_cat:
            continue

        if high_cat != 'phys':
            continue

        with open(path) as f:
            text = f.read()

        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            if marker in sentence:
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                for word_idx in indices:
                    word = words[word_idx][0]
                    if word == marker.strip():
                        for shift in x_all:
                            x_idx = shift + 5
                            if shift == 0:
                                # marker itself
                                continue
                            if word_idx+shift < 0 or \
                                    word_idx+shift >= len(words):
                                # out of range
                                continue
                            wrd = words[word_idx + shift][0]
                            pos = words[word_idx + shift][1]
                            if 'V' in pos:
                                y_verb[x_idx] += 1
                            if pos in ['NN', 'NNS']:
                                y_noun[x_idx] += 1
                            if pos in ['NNP', 'NNPS']:
                                y_propnoun[x_idx] += 1
                            if pos == 'IN':
                                y_prepos[x_idx] += 1
                            if 'JJ' in pos:
                                y_adj[x_idx] += 1
                            if 'W' in pos:
                                y_wh[x_idx] += 1
                            if 'RB' in pos:
                                y_adv[x_idx] += 1
                            if 'PR' in pos:
                                y_pr[x_idx] += 1
                            if wrd == 'FORMULA':
                                y_form[x_idx] += 1
                            if wrd == 'FIGURE':
                                y_fig[x_idx] += 1
                            if wrd == 'TABLE':
                                y_tab[x_idx] += 1
        if file_idx > 200:
            break

    for idx, y in enumerate([(y_verb, 'verb'), (y_noun, 'noun'),
                             (y_propnoun, 'proper noun'),
                             (y_prepos, 'preposition'), (y_adj, 'adjective'),
                             (y_wh, 'wh-det./-adv./-pron.'), (y_adv, 'adverb'),
                             (y_pr, 'pers./pos. pronoun'),
                             (y_form, 'formula')]):
        color = list(mpl.rcParams['axes.prop_cycle'])[idx]['color']
        plt.plot(x_all,
                 y[0],
                 marker='',
                 linestyle='-',
                 linewidth=.5,
                 alpha=0.3,
                 color=color)
        plt.plot(x_all,
                 y[0],
                 label=y[1],
                 marker='D',
                 linestyle='',
                 color=color)

    plt.xlabel('word position relative to citation')
    plt.ylabel('number of words')
    plt.legend()

    ax = plt.gca()
    ax.xaxis.grid(True)
    plt.xticks(np.arange(min(x_all), max(x_all), 1.0))

    plt.show()
Code example #3
train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()

    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)

    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)

    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    #tokenizer_en=PunktSentenceTokenizer()
    #tokenizer_de=PunktSentenceTokenizer()
    #nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with gzip.open(sys.argv[1], 'rt', encoding='utf-8') as filtered:
    for line in filtered:
        tabs = line.split('\t')
        line_src = tabs[3]
        line_tgt = tabs[4]
        sent_src = tokenizer_en.tokenize(line_src)
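
Example #3 contrasts training Punkt from scratch on raw corpora with loading NLTK's pretrained pickles (the snippet breaks off before the source and target sentence counts are compared). A minimal, hypothetical sketch of the same train-versus-pretrained choice on arbitrary text; the corpus path is a placeholder and the pretrained model assumes nltk.download('punkt') has been run:

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

with open("some_corpus.txt", encoding="utf-8") as f:  # placeholder path
    raw_text = f.read()

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True  # consider all period-final word pairs as collocation evidence
trainer.train(raw_text)
tokenizer_trained = PunktSentenceTokenizer(trainer.get_params())

# Pretrained alternative shipped with NLTK:
tokenizer_pretrained = nltk.data.load('tokenizers/punkt/english.pickle')

sample = "Mr. Smith arrived at 5 p.m. on Monday. He left early."
print(tokenizer_trained.tokenize(sample))
print(tokenizer_pretrained.tokenize(sample))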
Code example #4
File: word.py Project: todd-cook/cltkv1
 def __init__(self):
     self.language = "latin"
     self.punkt_param = PunktParameters()
     self.punkt_param.abbrev_types = set(ABBREVIATIONS)
     self.sent_tokenizer = PunktSentenceTokenizer(self.punkt_param)
     self.word_tokenizer = LatinLanguageVars()
Code example #5
File: preprocessor.py Project: johndpope/SumMe
#edit this when changing dirs
LangPaths = os.path.realpath(
    "C:/users/rihanna/Documents/Pol/ThesisIt/SumMe/Summarizer/langdetector/profiles/"
)
tltagger = nltk.data.load("taggers/filipino_aubt.pickle")  #filipino pos tagger

tlChunker = nltk.data.load(
    "chunkers/filipino_ub.pickle")  #filipino chunker here
enChunker = nltk.data.load("chunkers/conll2000_ub.pickle")  #enChunkerhere

punkt_param = PunktParameters()  # container for tokenizer parameters
punkt_param.abbrev_types = set(['gng', 'mr', 'mrs', 'dr', 'rep'])  # additional accepted abbreviations go here

sentence_splitter = PunktSentenceTokenizer(punkt_param)
tokenized = ""
gateway = JavaGateway()
detector = gateway.entry_point
detector.init(LangPaths)


def LangDetect(str):
    return detector.detect(str)


def tokenizer(str):

    #print(wordpunct_tokenize(str))
    return wordpunct_tokenize(str)
Code example #6
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings
  
    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text =  'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> tokenize_latin_words(text)
    ['Dices', 'ἐστιν', 'ἐμός', 'pulchrum', 'esse', 'inimicos', 'ulcisci', '.']
  
    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them; needed to make stream.readlines work in PlaintextCorpusReader
        
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
Code example #7
 def __init__(self):
     self.name = "Gale Church Alignment Scorer"
     self.tokenizer = PunktSentenceTokenizer()
     self.sblocks, self.tblocks = [], []
Code example #8
def tokenize(document):
    doc_tokenizer = PunktSentenceTokenizer()
    sentences_list = doc_tokenizer.tokenize(document)
    return sentences_list
Code example #9
def main(sysargs):
    sys.argv = sysargs
    arg_parser = argparse.ArgumentParser(
        description='Formats debates by removing HTML and filtering words.')
    arg_parser.add_argument('-i',
                            '--infile',
                            required=True,
                            help='Debate file to format.')
    args = arg_parser.parse_args()

    # Initialize nltk elements.
    parser = SpeechHTMLParser()
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    tagger_loc = '/het/users/jengi/stanford-postagger/'
    tagger = StanfordTagger(tagger_loc + 'models/wsj-0-18-bidirectional-distsim.tagger', \
                                tagger_loc + 'stanford-postagger.jar')
    stemmer = SnowballStemmer('english')

    # Read infile.
    speaker_pattern = re.compile('.*:')
    null_pattern = re.compile('\s*(\[[^\]]*\]|\([^\)]*\))')
    dash_pattern = re.compile('\S+(--)\s+')
    ellipse_pattern = re.compile('\s*\.\.\.\s*')
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', \
                 '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', \
                 '\\', ']', '^', '_', '`', '{', '|', '}', '~']
    block_lengths = []
    with open(args.infile, 'r') as afile:
        file_contents = afile.read()
        parser.feed(file_contents)
        parser.close()

        num_blocks = 0
        speeches = {}
        for (speaker, block) in parser.text:
            if num_blocks % 10 == 0:
                print >> sys.stderr, 'Processing block ' + str(
                    num_blocks) + ' ...'
            orig_block = block

            # Remove applause, laughter, etc.
            block = repeated_search(block, null_pattern, 0)

            # Remove -- from the end of words.  (Indicates stuttering / stopping.)
            block = repeated_search(block, dash_pattern, 1)

            # Do more complex tokenization.
            sents = sent_splitter.tokenize(block)
            sents = [ellipse_pattern.sub(' ... ', sent) for sent in sents]
            tokens = [tokenizer.tokenize(sent) for sent in sents]

            # Run POS tagger and keep only nouns.
            # Also lowercase and stem these nouns.
            tags = [tagger.tag(toks) for toks in tokens]
            tokens = []
            tagged_text = []
            for sent in tags:
                tokens.append([])
                for (word, tag) in sent:
                    tagged_text.append(word)
                    tagged_text.append(tag)
                    if tag in noun_tags:
                        tokens[len(tokens) - 1].append(
                            stemmer.stem(word.lower()))

            # Remove any "sentences" that are actually empty and
            # any tokens that are pure punctuation.
            for i in reversed(range(len(tokens))):
                for j in reversed(range(len(tokens[i]))):
                    non_punct = ''.join(
                        [tok for tok in tokens[i][j] if tok not in punct])
                    if len(non_punct) == 0:
                        del tokens[i][j]

                if len(tokens[i]) == 0:
                    del tokens[i]

            # Make sure there is still at least one sentence left.
            num_sents = len(tokens)
            if num_sents == 0:
                continue

            # Add block to speeches dictionary.
            speaker = speaker[:speaker_pattern.match(speaker).end() - 1]
            if speaker not in speeches:
                speeches[speaker] = []
            speeches[speaker].append(orig_block)
            speeches[speaker].append(' '.join(tagged_text))
            speeches[speaker].append('\n'.join(
                [' '.join(sent) for sent in tokens]))
            #print speeches[speaker][0]
            #print speeches[speaker][1]
            #print speeches[speaker][2]

            num_blocks += 1
            num_tokens = 0
            for toks in tokens:
                num_tokens += len(toks)
            block_lengths.append(num_tokens)

    # Save each speaker's text to a file.
    (infolder, basename) = os.path.split(os.path.abspath(args.infile))
    out_prefix = infolder + '/'
    out_suffix = basename
    for speaker in speeches:
        # Create outfile prefixed by speaker's name.
        outfile = open(out_prefix + speaker + '-' + out_suffix, 'w')

        # Save text to outfile.
        blocks = speeches[speaker]
        for i in range(0, len(blocks), 3):
            print >> outfile, blocks[i]
            print >> outfile, blocks[i + 1]
            print >> outfile, blocks[i + 2]
            print >> outfile

        outfile.close()

    print '# of blocks: ' + str(num_blocks)
    print 'Mean # of tokens (per block): ' + str(scipy.mean(block_lengths))
    print 'Median # of tokens: ' + str(scipy.median(block_lengths))
    print 'Standard deviation in # of tokens: ' + str(scipy.std(block_lengths))
Code example #10
def gather_input():

    #gather input
    for file in os.listdir("../scrapper/"):
        if file.endswith(".txt"):
            inputFile = file
    file = open("../scrapper/"+inputFile,"r")
    input = file.read()
    file.close()

    #os.remove("../scrapper/"+inputFile)

    #extract text
    reg_string=ur"\"text\":\"(.+?)[^\\]\""
    data_array=re.findall(reg_string,input)
    
    #extract location of tweet
    reg_string=ur"\"location\":\"(.*?)\""
    location_array=re.findall(reg_string,input)
        
    #extract whether retweeted or not
    reg_string = ur"\"retweeted\":(.+?),"
    retweet_bool=re.findall(reg_string,input)
    
    #today's date in YYYYMMDD format
    date = datetime.datetime.now()
    date = date.date()
    #date = date.strftime("%Y%m%d")

    ## calculate the barrier date
    date_diff = int(sys.argv[1])
    DD = datetime.timedelta(days=date_diff)
    barrier_date = (datetime.datetime.now()- DD).date()

    ## load the whitelist and create array of arrays as - [noun,sentiment,count]
    file = open("../py_code/white_list.txt","r")
    white_list = []
    line = file.readline()
    while line:
        white_list.append([line.rstrip(),0,0])
        line = file.readline()
    file.close()

    ## create a sentence_tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    
    ## next step is to inject into the database
    db = MySQLdb.connect(host="localhost",user="******",passwd="{2qGq(22+5iU",db="Insights")
    cur = db.cursor()
    
    ##filter out those tweets which have prices in them - usually sales, or retweets
    i=0
    for text in data_array:
        if retweet_bool[i]!="false":
            pass
        else:        
            ##filter text as many users don't put a space after the full stop - which is essential for the sentence tokenizer
            data_array[i] = re.sub(r'([\.\?\!])(\w)', r'\1 \2', data_array[i])
            
            blob = TextBlob(data_array[i])
            blob_sentiment = int(blob.sentiment.polarity*1000)/1000.0
            sql = "INSERT INTO Phrases(Phrase,Sentiment,Location,Date) VALUES (\""+data_array[i]+"\", "+str(blob_sentiment)+", \""+location_array[i]+"\", \""+str(date)+"\")"
            cur.execute(sql)

            ## tokenize the tweets, for sentiment analysis
            sentences = sent_tokenizer.tokenize(data_array[i])

            if len(sentences) == 1:
                ##run through the whiteList array, for each find count, add count, sentiment to array
                for word in white_list:
                    if((sentences[0].lower()).find(word[0])!=-1):
                        word[1]=word[1]+blob_sentiment
                        word[2]=word[2]+1

                        
            else:
                for sentence in sentences:
                    ##run through the whiteList array, for each find count and sentiment, add count, sentiment to array
                    for word in white_list:
                        if((sentence.lower()).find(word[0])!=-1):
                            blob = TextBlob(sentence)
                            word[1]=word[1]+int(blob.sentiment.polarity*1000)/1000.0
                            word[2]=word[2]+1
                            
                           
        i=i+1
    db.commit()

    ### now integrate these into Sentiment db, if there is no entry for today insert phrase and create one
    sql = "SELECT * FROM Sentiment WHERE `Date` ='"+str(date)+"' LIMIT 1;"
    cur.execute(sql)
    if(cur.rowcount==0):
        for word in white_list:
            if(word[2]!=0):
                sql = "INSERT INTO Sentiment VALUES ('"+str(date)+"','"+word[0]+"','"+str(word[1])+"','"+str(word[2])+"');"
                cur.execute(sql)
                
    ### else get the entry in the table, add sentiment and count, store back
    else:
        for word in white_list:
            if(word[2]!=0):
                sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` ='"+str(date)+"'AND `Phrase`='"+word[0]+"';"
                cur.execute(sql)
                for row in cur.fetchall():
                    new_sentiment = float(row[0])+word[1]
                    new_count = row[1]+word[2]
                sql = "UPDATE Sentiment SET `Sentiment`="+str(new_sentiment)+",`Count`="+str(new_count)+" WHERE `Date` ='"+str(date)+"'AND `Phrase`='"+word[0]+"';"
                cur.execute(sql)
    db.commit()
    
    ### now add all the sentiment and count for all phrases in the white list in the Sentiment db above the barrier_date, add to json those whose count is not zero
    total_sentiment = 0;
    total_count = 0;
    json_array = [];
    for word in white_list:
        sql = "SELECT Sentiment,Count FROM Sentiment WHERE `Date` >'"+str(barrier_date)+"'AND `Phrase`='"+word[0]+"';"
        cur.execute(sql)
        if(cur.rowcount!=0):
            for row in cur.fetchall():
                total_sentiment = total_sentiment+float(row[0])
                total_count = total_count+int(row[1])
            json_array.append({"noun": word[0], "sentiment": int(total_sentiment/total_count*1000)/1000.0, "count": total_count})
            total_sentiment = 0;
            total_count = 0;
    
    db.close()   
    print(json.dumps(json_array))
Code example #11
def process_text_list(seg, text_list, new_json, zone):
    for text_part in text_list:
        if "text" in text_part:
            the_sentences = seg.segment(text_part["text"])
            sentences = []
            for the_span in the_sentences:
                span = {}
                span["start"] = the_span.start
                span["end"] = the_span.end
                sentences.append(span)
            # check if result is acceptable
            valid_segmentation = validate_segmentation(sentences)
            if not valid_segmentation:
                # fall back to NLTK
                sentences = []
                for start, end in PunktSentenceTokenizer().span_tokenize(text_part["text"]):
                    span = {}
                    span["start"] = start
                    span["end"] = end
                    sentences.append(span)
            offset_pos = 0
            # the following is to cancel a sentence segmentation because it is located in the middle of an existing span
            # if previous_start is -1, previous segmentation was correct
            previous_start = -1
            for span in sentences:
                if previous_start != -1:
                    span["start"] = previous_start
                    previous_start = -1

                offset_pos = span["start"]
                sentence_structure = OrderedDict()

                sentence_structure["text"] = text_part["text"][span["start"]:span["end"]]
                
                if "section" in text_part:
                    sentence_structure["section"] = text_part["section"]
                if "paragraph_rank" in text_part:
                    sentence_structure["paragraph_rank"] = text_part["paragraph_rank"]
                if "section_rank" in text_part:
                    sentence_structure["section_rank"] = text_part["section_rank"]
                
                if "ref_spans" in text_part:
                    new_ref_spans = []
                    for ref_span in text_part["ref_spans"]:
                        # check if we have a segmentation in the middle of a ref span
                        if ref_span["start"] >= offset_pos and ref_span["start"] < span["end"] and ref_span["end"] > span["end"]:
                            """
                            print("\nwarning, segmentation in the middle of ref span: sentence at", 
                                span["start"], span["end"], "with ref at", ref_span["start"], ref_span["end"])
                            print("sentence:", text_part["text"][span["start"]:span["end"]])
                            print("ref:", text_part["text"][ref_span["start"]:ref_span["end"]])
                            print("\n")
                            """
                            # in this case, we cancel this sentence boundary
                            previous_start = span["start"]
                            break

                        if ref_span["start"] >= offset_pos and ref_span["end"] <= span["end"]:
                            new_ref_span = OrderedDict()
                            new_ref_span["start"] = ref_span["start"] - offset_pos
                            new_ref_span["end"] = ref_span["end"] - offset_pos
                            if "type" in ref_span:
                                new_ref_span["type"] = ref_span["type"]
                            if "ref_id" in ref_span:
                                new_ref_span["ref_id"] = ref_span["ref_id"]
                            if "text" in ref_span:
                                new_ref_span["text"] = ref_span["text"]
                            new_ref_spans.append(new_ref_span)
                    if len(new_ref_spans) > 0 and previous_start == -1:
                        sentence_structure["ref_spans"] = new_ref_spans

                if "entity_spans" in text_part and previous_start == -1:
                    new_entity_spans = []
                    for entity_span in text_part["entity_spans"]:
                        # check if we have a segmentation in the middle of an entity span
                        if entity_span["start"] >= offset_pos and entity_span["start"] < span["end"]  and entity_span["end"] > span["end"]:
                            """
                            print("\nwarning, segmentation in the middle of entity span: sentence at", 
                                span["start"], span["end"], "with entity at", entity_span["start"], entity_span["end"])
                            print("sentence:", text_part["text"][span["start"]:span["end"])
                            print("entity:", text_part["text"][entity_span["start"]:entity_span["end"]])
                            print("\n")
                            """
                            # in this case, we cancel this sentence boundary
                            previous_start = span["start"]
                            break

                        if entity_span["start"] >= offset_pos and entity_span["end"] <= span["end"]:
                            new_entity_span = OrderedDict()
                            new_entity_span["start"] = entity_span["start"] - offset_pos
                            new_entity_span["end"] = entity_span["end"] - offset_pos
                            if "type" in entity_span:
                                new_entity_span["type"] = entity_span["type"] 
                            if "rawForm" in entity_span:
                                new_entity_span["rawForm"] = entity_span["rawForm"]
                            if "resp" in entity_span:
                                new_entity_span["resp"] = entity_span["resp"]
                            if "used" in entity_span:
                                new_entity_span["used"] = entity_span["used"]
                            if "id" in entity_span:
                                new_entity_span["id"] = entity_span["id"]
                            if "cert" in entity_span:
                                new_entity_span["cert"] = entity_span["cert"]
                            new_entity_spans.append(new_entity_span)
                    if len(new_entity_spans) > 0 and previous_start == -1:
                        sentence_structure["entity_spans"] = new_entity_spans

                if previous_start == -1:
                    new_json[zone].append(sentence_structure)
Code example #12
 def _finalize_sent_tokenizer(self):
     """Re-instantiate sentence tokenizer to ensure has updated params."""
     self._sent_tokenizer = PunktSentenceTokenizer(
         self._sent_trainer.get_params())
Code example #13
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import networkx as nx
import pylab as plt

open_toi = open('/Users/aj/Documents/TOI_data3.txt',
                encoding='utf-8',
                mode='r+')
read_toi = open_toi.read()

#Tokenization
sen_token = PunktSentenceTokenizer()
tokens = sen_token.tokenize(read_toi)

#TF-IDF
matrix = CountVectorizer(stop_words=None).fit_transform(tokens)
print("transform matrix:\n")
print(matrix)
norm = TfidfTransformer().fit_transform(matrix)

print("normalized:\n", norm)
print("\n")
print("normalizer.T:\n", norm.T)
print("\n")

#similarity between sentences
similarity = norm * norm.T
print("similarity graph:\n")
print(similarity)
print("similarity Matrix:\n")
print(similarity.toarray())
Code example #14
def performSentenceSegmentation(file_content):
    #Training the model using given text: unsupervised learning
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(file_content)
    sentence_segmentation = tokenizer.tokenize(file_content)
    return sentence_segmentation
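
Training and tokenising can also be collapsed into the constructor, since PunktSentenceTokenizer accepts training text directly; a small sketch that should behave equivalently to the function above (not verified against it):

from nltk.tokenize.punkt import PunktSentenceTokenizer

def perform_sentence_segmentation(file_content):
    # Passing the text to the constructor trains the model before tokenising it.
    return PunktSentenceTokenizer(file_content).tokenize(file_content)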
Code example #15
    def summarize(self):
        sents = []
        sentence_tags_dict = {}
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_tokenizer = PunktSentenceTokenizer()
        for document in self.documents:
            for sent in sent_detector.sentences_from_text(document):
                np_extractor = NPExtractor(sent)
                result = np_extractor.extract()
                index = len(sents)
                for tag in result:
                    # print tag
                    tag = tag.lower()
                    # print tag
                    if tag in sentence_tags_dict.keys():
                        value = sentence_tags_dict[tag]
                        # print "Found", tag, value
                        value.append(index)
                        sentence_tags_dict[tag] = value
                    # else:
                    # print value
                    else:
                        sentence_tags_dict[tag] = [
                            index,
                        ]
                        # print "Set", tag, sentence_tags_dict[tag]
                # print "This sentence is about: %s" % ", ".join(result)
                sents.append(sent)

        cv = CountVectorizer()
        bow_matrix = cv.fit_transform(sents)
        features = cv.get_feature_names()

        selected_sents = set()
        for feature in features:
            # print feature
            if feature in sentence_tags_dict.keys():
                # print "FOUND FEATURE"
                # print feature
                if sentence_tags_dict[feature]:
                    for index in sentence_tags_dict[feature]:
                        selected_sents.add(sents[index])
            else:
                pass

        # print len(sents)
        # print len(selected_sents)
        # print "Documents", len(self.documents)
        # print "Cosine Similarity"
        # self.cosine_similarity(sents)

        # print "\n\nAll Sentences Summary\n\n"
        # self.generate_summary(sents)

        # print "\n\nSelected Sentences\n"
        summaries, removed_sentences = self.generate_summary(sents)

        # self.document_summaries(summaries)
        # print removed_sentences

        return self.document_summaries(summaries)
Code example #16
 def tokenize_all(doc):
     # insert the check statement here
     sentence_tokenizer = PunktSentenceTokenizer()
     sentences = sentence_tokenizer.tokenize(doc)
     return sentences
Code example #17
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# from nltk.tokenize import PunktWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import WordPunctTokenizer

text = ("Are you curious about tokenization? " + "Let's see how it works! " +
        "We need to analyze a couple of " +
        "sentences with punctuations to see it in action.")

sent_tokenize_list = sent_tokenize(text)
print("Sentence tokenizer:")
print(sent_tokenize_list)

print("Word tokenizer:")
print(word_tokenize(text))

# Create a new Punkt sentence tokenizer

punkt_sent_tokenizer = PunktSentenceTokenizer()
print("Punkt word tokenizer:")
print(punkt_sent_tokenizer.tokenize(text))

word_punct_tokenizer = WordPunctTokenizer()
print("Word punct tokenizer:")
print(word_punct_tokenizer.tokenize(text))
Code example #18
    def _identify_keywords(self, key_terms=None):
        sent_tokenizer = PunktSentenceTokenizer()
        reflection_sentences = sent_tokenizer.tokenize(self.reflection)

        word_tokenizer = nltk.tokenize.word_tokenize

        for sentence in reflection_sentences:

            sent = word_tokenizer(sentence.lower())

            if key_terms is None:

                tokenized_sentence = [(x[0], util.clean_word(x[1]))
                                      for x in enumerate(sent)]

                relevant_data = Data.adjectives + Data.substantives

                result_obj = {'sentence': sentence, 'terms': []}

                terms_found = self._find_terms(tokenized_sentence,
                                               relevant_data)
                if terms_found:
                    result_obj['terms'].append(terms_found)
                    self.results.append(result_obj)

            else:
                for key_term in key_terms:

                    # build an enumerated list so we have the index of each split word

                    # this handles the case where the same verb occurs more than once in the same sentence
                    key_term_indexes = [(i, v) for i, v in tokenized_sentence
                                        if v == key_term['word']]

                    if not key_term_indexes:
                        continue

                    for key_term_index, t in key_term_indexes:

                        result_obj = {
                            'sentence': sentence,
                            'terms': [key_term]
                        }

                        # offset for searching for other elements around the verb
                        offset = 5

                        # surr stands for surrounding
                        surr_words = [
                            (index, word) for index, word in
                            tokenized_sentence[max(0, key_term_index -
                                                   offset):key_term_index +
                                               offset]
                        ]
                        cleaned_surr_words = [(index, util.clean_word(word))
                                              for index, word in surr_words]

                        for terms in self._find_terms(surr_words,
                                                      Data.substantives,
                                                      clean=False):
                            result_obj['terms'].append(terms)

                        for terms in self._find_terms(cleaned_surr_words,
                                                      Data.adjectives):
                            result_obj['terms'].append(terms)

                        # if we haven't found any of the substantives or adjectives, we shouldn't look for adverbs

                        if [
                                x for x in result_obj['terms']
                                if x['type'] != 'verb'
                                and x['type'] != 'personal_pronoun'
                        ]:
                            adverbs = Data.adverbs
                            result = self._find_terms(cleaned_surr_words,
                                                      Data.adverbs)
                            for terms in result:
                                result_obj['terms'].append(terms)

                            self.results.append(result_obj)
Code example #19
 def __init__(self):
     super(GCBlockExtractor,
           self).__init__(extraction_function=self._blocks_from_text)
     self.tokenizer = PunktSentenceTokenizer()
Code example #20
def annotate_text(raw_data_folder,
                  labels_data_folder,
                  file_to_write,
                  max_sent_len=35,
                  improved_sent_splitting=True,
                  training=True):
    """
    Creates a token-level input file for the span identification task and adds
    sentence IDs to the tokens.
    """
    # max_sent_len = -1 ==> no sentence splitting
    if max_sent_len == -1:
        # the corresponding if-block can handle this
        improved_sent_splitting = True
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    if improved_sent_splitting:
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set([
            'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'ms', 'rep', 'u.s', 'feb',
            'sen'
        ])
        splitter = PunktSentenceTokenizer(punkt_param)
        splitter.PUNCTUATION = tuple(';:,.!?"')
    output_table = []
    file_counter = 0
    sent_no_total = 0

    print("Total number of files - {}".format(len(
        os.listdir(raw_data_folder))))

    # Reading all the files from the raw text directory
    article_file_names = [
        file_name for file_name in os.listdir(raw_data_folder)
        if file_name.endswith(".txt")
    ]
    article_file_names.sort()

    for file_name in article_file_names:
        if training:
            label_file_name = file_name.replace(".txt", ".task2-TC.labels")
            print("raw_article: {}\tlabel_file: {}".format(
                file_name, label_file_name))

            # Read the labels file with 4 columns of format
            # doc_id : label_of_span : idx_span_begin : idx_span_end
            with open(os.path.join(labels_data_folder, label_file_name),
                      encoding="utf-8") as file:
                rows = file.readlines()
                rows = [
                    row.strip().split("\t") for row in rows
                    if len(row.split("\t")) == 4
                ]

                # Saving mappings char_idx->labels into the dictionary
                char_idx2label = dict()
                for row in rows:
                    label = row[1]
                    idx_from = int(row[2])
                    idx_to = int(row[3])

                    for idx in range(idx_from, idx_to):
                        if idx not in char_idx2label.keys():
                            char_idx2label[idx] = []
                        char_idx2label[idx].append(label)
        else:
            print("raw_article: " + file_name)

        # Read the article and process the text
        with open(os.path.join(raw_data_folder, file_name),
                  encoding="utf-8") as file:
            file_text = file.readlines()
            # Keep linebreaks for better sentence splitting
            file_text = ''.join([line for line in file_text])

            # Normalizing punctuation marks to help the tokenizer.
            file_text = file_text.replace('“', '"').replace('”', '"')
            file_text = file_text.replace("’", "'").replace("‘", "'")

            sentences = []
            if improved_sent_splitting:
                # Line breaks -> helps with headlines
                paragraphs = file_text.split('\n')
                for para in paragraphs:
                    para = para.strip()
                    sentences_raw = splitter.sentences_from_text(para)
                    for sent in sentences_raw:
                        sent = sent.strip()
                        tokens = tokenizer(sent)
                        if len(tokens) <= max_sent_len or max_sent_len == -1:
                            # No need to split the sentence!
                            if len(sent) == 0:
                                # Can happen when paragraphs are separated by
                                # several line breaks.
                                continue
                            sentences.append(sent)
                            continue

                        # Try splitting based on quotes.
                        quote_fragments, all_ok = punct_based_split_sent(
                            tokenizer, sent, max_sent_len, '"')
                        if all_ok:
                            sentences += quote_fragments
                            continue

                        # Other punctuation for splitting: ; :
                        for quote_frag in quote_fragments:
                            semicolon_fragments, all_ok =\
                                punct_based_split_sent(tokenizer, quote_frag,
                                                       max_sent_len, ';')
                            if all_ok:
                                sentences += semicolon_fragments
                                continue

                            for semicolon_frag in semicolon_fragments:
                                colon_fragments, all_ok =\
                                    punct_based_split_sent(tokenizer,
                                                           semicolon_frag,
                                                           max_sent_len, ':')
                                if all_ok:
                                    sentences += colon_fragments
                                    continue

                                # Commas:
                                for col_frag in colon_fragments:
                                    comma_fragments, all_ok =\
                                        punct_based_split_sent(tokenizer,
                                                               col_frag,
                                                               max_sent_len,
                                                               ',')
                                    if all_ok:
                                        sentences += comma_fragments
                                        continue

                                    # Last resort:
                                    # Split after max_sent_len tokens
                                    for comma_frag in comma_fragments:
                                        sentences += forcefully_split_sent(
                                            tokenizer, comma_frag,
                                            max_sent_len)
            else:
                # Cut long sentences into fragments that are (up to)
                # max_sent_len characters long
                # (the last fragment in a sentence might be shorter)
                file_text = file_text.replace('\n', ' ')
                sentences_raw = sent_tokenize(file_text)
                for sent in sentences_raw:
                    sentences += forcefully_split_sent(tokenizer, sent,
                                                       max_sent_len)

            i = 0
            for sent in sentences:
                sent = sent.strip()
                i = file_text.find(sent, i)
                max_idx = i + len(sent)

                if sent == '':
                    continue

                if improved_sent_splitting:
                    if len(sent.strip()) < 2:  # single char noise
                        continue

                sent_no_total += 1
                for token in tokenizer(sent):
                    token = str(token)
                    token_idx = file_text.find(token, i, max_idx)
                    i = token_idx + len(token)
                    output = [
                        file_name.replace("article", "").replace(".txt", ""),
                        str(sent_no_total),
                        str(token_idx),
                        str(i), token
                    ]
                    if training:
                        # Check the label of the corresponding char_idx
                        label = char_idx2label.get(token_idx, ['None'])
                        output.append("|".join(label))
                    output_table.append(output)

        file_counter += 1
        print("Finished {} files\n".format(file_counter))

        with open(file_to_write, 'w', encoding="utf-8") as f:
            f.write('# max_sent_len=' + str(max_sent_len) +
                    ', improved_sent_splitting=' +
                    str(improved_sent_splitting) + '\n')
            f.write('document_id\tsent_id\ttoken_start\ttoken_end\ttoken')
            if training:
                f.write('\tlabel')
            f.write('\n')
            for row in output_table:
                f.write('\t'.join(row) + "\n")
Code example #21
File: apptextsum.py Project: manHax/uas
    def ringkasan(self):
        array_text = []
        text = ' '.join(
            re.sub("(@[A-Za-z1-9]+)|(\w+:\/\/\S+)", " ",
                   self.inputTeks).split())
        text = re.sub('<[^>]*>', '', text)  # remove punctuation/markup
        #text = re.sub("\d+", "", inputTeks)  # remove numbers
        emoticons = re.findall('(?::|;|=)()(?:-)?(?:\)|\(|D|P)',
                               text)  # extract emoticons
        #text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')) # lowercase all letters
        array_text.append(text)

        for teks in array_text:
            document = teks

        doc_tokenizer = PunktSentenceTokenizer()
        sentences_list = doc_tokenizer.tokenize(document)

        cv = CountVectorizer()
        cv_matrix = cv.fit_transform(sentences_list)
        a = cv_matrix.toarray()

        normal_matrix = TfidfTransformer().fit_transform(cv_matrix)
        tfidf = normal_matrix.toarray()

        vektorkalimat = normal_matrix.toarray()
        A = vektorkalimat[0]
        B = vektorkalimat[2]
        dot = np.dot(A, B)
        norma = np.linalg.norm(A)
        normb = np.linalg.norm(B)
        cos = dot / (norma * normb)

        A = normal_matrix.T

        res_graph = normal_matrix * normal_matrix.T  # similarity / adjacency matrix
        G = res_graph.toarray()

        G = nx.from_numpy_matrix(np.matrix(G), create_using=nx.DiGraph)

        nx_graph = nx.from_scipy_sparse_matrix(res_graph)

        pageranks = nx.pagerank(nx_graph)
        sentence_array = sorted(
            ((pageranks[i], s) for i, s in enumerate(sentences_list)),
            reverse=True)
        sentence_array = np.asarray(sentence_array)

        rank_max = float(sentence_array[0][0])
        rank_min = float(sentence_array[len(sentence_array) - 1][0])

        temp_array = []

        # If all ranks are the same,
        # taking any sentence will give the summary, say the first sentence
        flag = 0
        if rank_max - rank_min == 0:
            temp_array.append(0)
            flag = 1

        # If the sentence has different ranks
        if flag != 1:
            for i in range(0, len(sentence_array)):
                temp_array.append((float(sentence_array[i][0]) - rank_min) /
                                  (rank_max - rank_min))

        print(len(temp_array))
        print(temp_array)
        print(sentence_array[4], [0])

        threshold = (sum(temp_array) / len(temp_array))

        sentence_list = []
        if len(temp_array) > 1:
            for i in range(0, len(temp_array)):
                if temp_array[i] > threshold:
                    sentence_list.append(sentence_array[i][1])
        else:
            sentence_list.append(sentence_array[0][1])

        summary = " ".join(str(x) for x in sentence_list)
        print(summary)
        # save the data in another file, names sum.txt
        if self.modesum == "web":
            namaf = 'ringkasan_url.txt'
        else:
            namaf = 'ringkasan_file.txt'
        f = open(namaf, 'w+')
        #print(type(f))
        f.write("\n")
        f.write(summary)
        f.close()

        self.akhir = summary
Code example #22
def tokenizeText2(text):  #for abbreviations
    punkt_param = PunktParameters()
    abbreviation = abbrevations
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    return [sent for sent in tokenizer.tokenize(text) if sent not in symbols]
Code example #23
	def __init__(self,text):
		self.text=text
		self.text = ' '.join(self.text.strip().split('\n'))
		self.sentence_splitter = PunktSentenceTokenizer()
		self.sentences = self.sentence_splitter.tokenize(text)
Code example #24
File: nlp.py Project: L0m/sentence-similarity
def get_sentence_spans(text: str):
    for start, end in PunktSentenceTokenizer().span_tokenize(text):
        yield start, end
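
Because span_tokenize yields character offsets into the original string, the spans from get_sentence_spans can be sliced straight back out of the input text; a short usage sketch:

text = "First sentence. Second one!  Third?"
for start, end in get_sentence_spans(text):
    print((start, end), repr(text[start:end]))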
Code example #25
                total_correct += 1
                true_predictions.append(p)

    f1_by_class = " ".join(
        str(f1) for f1 in
        f1_score(true_labels, model_predictions, average=None).tolist())
    print(
        f'\n\n\nAcc: {total_correct / len(model_predictions):.7f} ' +
        f'F1: {f1_score(true_labels, model_predictions, average="macro"):.7f} ' +
        f'F1 by classes: {f1_by_class} ' +
        f'Total correct {total_correct} out of {len(model_predictions)} ' +
        f'Correct by classes: {[true_predictions.count(c) for c in list(range(num_classes))]} / ' +
        f'{[true_labels.count(c) for c in list(range(num_classes))]}\n'
    )


bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
sentence_tokenizer = PunktSentenceTokenizer()


def bert_tokenize(text):
    sentences = sentence_tokenizer.tokenize(text)
    text = ''
    for sentence in sentences:
        text += sentence + ' [SEP] '
    tokens = []
    tokens.append("[CLS]")
    tokens += bert_tokenizer.tokenize(text)
    return tokens


if __name__ == '__main__':
    with open('config.json', 'r') as f:
Code example #26
File: sentenceTokenizer.py Project: h2r/slu_core
 def __init__(self):
     self.tokenizer = PunktSentenceTokenizer()
Code example #27
        return 'Low'
    elif x < 255 / 1000:
        return 'A1'
    elif x < 550 / 1000:
        return 'A2'
    elif x < 785 / 1000:
        return 'B1'
    else:
        return 'B2+'


class BulletPointLangVars(PunktLanguageVars):
    sent_end_chars = ('.', '?', '!', '•', '...', '|')


SENT_TOKENIZER = PunktSentenceTokenizer(lang_vars=BulletPointLangVars())
TEMPLATE_POSTER_URL = 'https://s.studiobinder.com/wp-content/uploads/2017/12/Movie-Poster-Template-Light-With-Image.jpg?x81279'
IMDB = pd.read_csv(processed_data_dir / 'movie_details_db.csv',
                   dtype={'id': str})


class Movie():
    ''' A movie in the Movielingo app

    Attributes:
    - title (str)
    - IMDB ID (str)
    - link to movie poster (str)
    - subtitle features (pandas df with NLP features)
    - subtitle difficulty distribution (list)
    - IMDB page (BeautifulSoup)
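
For context on the BulletPointLangVars subclass in example #27: it widens Punkt's default sentence-end characters ('.', '?', '!') so that bullets and pipes also close a sentence, which presumably suits bullet-style subtitle or description text. A quick hedged check of the effect (expected output only, not verified here):

from nltk.tokenize.punkt import PunktSentenceTokenizer

default_tokenizer = PunktSentenceTokenizer()
bullet_tokenizer = PunktSentenceTokenizer(lang_vars=BulletPointLangVars())

blurb = "Great cast • Weak plot • Stunning visuals"
print(default_tokenizer.tokenize(blurb))  # expected: a single 'sentence'
print(bullet_tokenizer.tokenize(blurb))   # expected: one fragment per bullet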
Code example #28
import os
import re
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk.corpus import gutenberg

#This file adds possible abbreviations into a txt file for nltk to learn in "process.py"

#Put in the path of the directory containing all txt files (crawled data)
directory = 'C:\\Users\\hyzha\\PycharmProjects\\NY Times World\\NY Times World'
text = ""
for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())
abr = []

n = 1
for filename in os.listdir(directory):
    filepath = directory + '\\' + filename
    if 'text.txt' in filepath:
        try:
            with open('new.txt', 'w') as new:
                with open(filepath, 'r', encoding='utf-8',
                          errors='ignore') as f:
                    print(n)
                    n += 1
                    i = 0
                    for line in f:
                        i += 1
Code example #29
    'a', 'a.k.', 'pr', 'm', 'e', 'a.s', 'adv', 'akad', 'aklg', 'akt', 'al',
    'angl', 'apyg', 'aps', 'apskr', 'asist', 'asmv', 'avd', 'atsak', 'aut',
    'biol', 'b.k', 'bkl', 'bot', 'bt', 'buv', 'chem', 'd', 'dab', 'dail',
    'dek', 'dėst', 'dir', 'dirig', 'doc', 'dr', 'drp', 'dš', 'e.p', 'el.p',
    'egz', 'eil', 'ekon', 'el', 'e', 'etc', 'ež', 'fak', 'faks', 'filol',
    'filos', 'g', 'gyv', 'G', 'gen', 'geol', 'gerb', 'gim', 'gyd', 'gv', 'įl',
    'Įn', 'insp', 'inž', 'pan', 't.t', 'istor', 'k', 'Em.', 'k.a', 'kand',
    'kat', 'kg', 'kyš', 'kl', 'kln', 'kn', 'koresp', 'kpt', 'kr', 'kt', 'kun',
    'l.e.p', 'liet', 'ltn', 'mat', 'med', 'mėn', 'mgr', 'mgnt', 'min', 'mjr',
    'mln', 'mlrd', 'mok', 'mst', 'mstl', 'N', 'nkt', 'ntk', 'nr', 'p', 'p.d',
    'p.m.e', 'pav', 'pavad', 'pirm', 'pl', 'plg', 'plk', 'pr.kr', 'proc',
    'prof', 'prok', 'prot', 'pss', 'pšt', 'pvz', 'r', 'red', 'rš', 'raj', 's',
    'sąs', 'sav', 'saviv', 'sekr', 'sek', 'sen', 'sk', 'skg', 'skyr', 'skv',
    'sp', 'spec', 'sr', 'st', 'str', 'stud', 'š.m', 'šnek', 'šv', 't', 't.y',
    't.p', 'techn', 'tel', 'teol', 'tir', 'tūkst', 'tūkstm', 'up', 'upl', 'V',
    'vad', 'val', 'ved', 'vet', 'vnt', 'vrš', 'vyr', 'vyresn', 'vs', 'Vt',
    'vtv', 'vv', 'zool', 'žml', 'žr', 'ž.ū', 'šmt'
]
punkt_param.abbrev_types = set(abbreviation)
tokenizer = PunktSentenceTokenizer(punkt_param)

with open(input_file) as ifile:
    with open(output_file, "w") as ofile:
        for i, line in tqdm(enumerate(ifile)):
            if line != "\n":
                # sent_list = nltk.tokenize.sent_tokenize(line)
                sent_list = tokenizer.tokenize(line)
                for sent in sent_list:
                    ofile.write(sent + "\n")
                ofile.write(doc_seperator)
Code example #30
File: named_ent_chunker.py Project: jhave/DS_HK_2
from nltk import ne_chunk,pos_tag
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
'''
	import nltk
	nltk.download('words')
	nltk.download('punkt')
	nltk.download('maxent_treebank_pos_tagger')
	nltk.download('maxent_ne_chunker')
'''


TreeBankTokenizer = TreebankWordTokenizer()
PunktTokenizer = PunktSentenceTokenizer()
text = '''
The Boston Celtics are a National Basketball Association (NBA) team based in Boston, MA. They play in the Atlantic Division
 of the Eastern Conference. Founded in 1946, the team is currently owned by 
 Boston Basketball Partners LLC. The Celtics play their home games at the TD Garden,
 which they share with the Boston Blazers (NLL), and the Boston Bruins of the NHL.
 
 The Celtics have dominated the league during the late 50's and through the mid 80's, 
 with the help of many Hall of Famers which include Bill Russell, Bob Cousy, John Havlicek, 
 Larry Bird and legendary Celtics coach Red Auerbach, 
 combined for a 795 - 397 record that helped the Celtics win 16 Championships.
'''

sentences = PunktTokenizer.tokenize(text)
tokens = [TreeBankTokenizer.tokenize(sentence) for sentence in sentences]
tagged = [pos_tag(token) for token in tokens]
chunked = [ne_chunk(taggedToken) for taggedToken in tagged]
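
The chunked trees at the end of example #30 nest named-entity subtrees inside each sentence tree; a short follow-up sketch for flattening them into (label, phrase) pairs:

from nltk.tree import Tree

for sentence_tree in chunked:
    for subtree in sentence_tree:
        # ne_chunk wraps recognised entities in labelled subtrees (e.g. GPE, ORGANIZATION)
        if isinstance(subtree, Tree):
            print(subtree.label(), ' '.join(word for word, tag in subtree.leaves()))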