Example #1
    def myTokenizer(self, txt):
        """
        Tokenize text with a regular expression.
        @param txt: the text to tokenize, as a single string
        @rtype: list of token strings
        """
        return re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", txt)
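# A quick sanity check of the tokenizer regex above (not part of the original
# class; the sample sentence is invented):
import re
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*",
                 "It's a state-of-the-art parser."))
# -> ["It's", 'a', 'state-of-the-art', 'parser', '.']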
Example #2
import re

def findCurrency(text):
    """ Find currency amounts in each string of text and print them. """

    symbols = "$£eurospoundsdollars"

    # Iterate through each item in text and find all substrings that look
    # like an amount of money (symbol-prefixed or unit-suffixed).
    for i in text:
        matches = re.findall(
            r'((?:(?:\$|£)(?:\d+)(?:\.?\d*,?\d{1,3})(?:bn|m)?)|'
            r'(?:(?:\d+)(?:\.?,?\d)*(?:bn|m)?(?: ?euros?| ?dollars?| ?pounds?| ?p)))',
            i, re.IGNORECASE)

        # For every match, work out the currency, strip the symbols to get
        # the bare amount, and print both.
        if matches:
            for m in matches:
                if re.search(r'\$|dollars?', m, re.IGNORECASE):
                    currency = "Dollar"
                if re.search(r'£|pounds?|p', m, re.IGNORECASE):
                    currency = "Pound"
                if re.search(r'euros?', m, re.IGNORECASE):
                    currency = "Euro"

                amount = m.strip(symbols)

                print("Found a match!" + "\nCurrency:", currency,
                      "\nAmount:", amount, "\n")
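# A hedged usage sketch for findCurrency (the sample sentences are invented).
# Each matched amount is printed together with its detected currency:
findCurrency(["The firm raised $1.5bn last year.",
              "Tickets cost 30 euros or 25 pounds."])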
Example #3
import csv
import re

def sentence2word(inputFile, outputFile):
    """ Turn a file of sentences into a CSV of (token, tag) rows. """
    with open(inputFile) as dataFile:
        sentences = dataFile.read().splitlines()

    rows = []
    for sentence in sentences:
        rows.append([sentence, 'Sentence'])
        rows.append(['BOS', 'BOS'])

        # split the sentence into words and punctuation marks
        words = re.findall(r"[\w']+|[().,!?;]", sentence)
        for word in words:
            rows.append([word, 'O'])
        rows.append(['EOS', 'EOS'])

    with open(outputFile, 'w', newline='') as w:
        writer = csv.writer(w)
        writer.writerows(rows)
    print("Done: sentence text to word+'O' csv")
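# A minimal round trip for sentence2word (the file names here are made up):
# each sentence becomes a 'Sentence' row, then BOS, one row per token tagged
# 'O', and EOS.
with open('sentences.txt', 'w') as f:
    f.write("Hello, world!\nHow are you?\n")
sentence2word('sentences.txt', 'words.csv')
# words.csv starts with:
#   "Hello, world!",Sentence
#   BOS,BOS
#   Hello,O
#   ",",O
#   world,O
#   !,O
#   EOS,EOS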
Example #4
def convert_emphesize(text, return_count=False):
    # collect the distinct all-caps words (two or more letters) used for emphasis
    emphs = re.findall(r'\b[A-Z]{2,}\b', text)
    emphs = set(emphs)
    if return_count:
        return len(emphs)
    for emph_ in emphs:
        text = re.sub(r'\b' + emph_ + r'\b', emph_ + ' emphh', text)
    return text
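# Example use of convert_emphesize (sample text invented): every all-caps word
# of two or more letters gets an ' emphh' marker appended after it.
print(convert_emphesize("This update is VERY IMPORTANT."))
# -> This update is VERY emphh IMPORTANT emphh.
print(convert_emphesize("This update is VERY IMPORTANT.", return_count=True))
# -> 2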
Example #5
def strip_url(text, return_count=False):
    # match URL-like substrings: letters, digits, common URL punctuation and
    # percent-encoded bytes after an http(s):// prefix
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    if return_count:
        return len(urls)
    for url in urls:
        text = text.replace(url, '_URL_')
    text = text.replace('https:', '')
    return text
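# Example use of strip_url (sample text invented): URLs are replaced with a
# placeholder token, or counted when return_count=True.
print(strip_url("See https://example.com/docs for details"))
# -> See _URL_ for details
print(strip_url("See https://example.com/docs for details", return_count=True))
# -> 1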
Example #6
def is_long_number(text, threshold=1, flag_res=False):
    # collect every run of digits in the text
    numbers_lens = re.findall(r'\d+', text)
    if numbers_lens and len(max(numbers_lens, key=len)) >= threshold:
        if flag_res:
            # return the length of the longest digit run
            return len(max(numbers_lens, key=len))
        return text + ' _longnumber_'
    if flag_res:
        return 0
    return text
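# Example use of is_long_number (sample text invented): texts containing a
# digit run of at least `threshold` digits get a '_longnumber_' marker.
print(is_long_number("call 5551234567 now", threshold=7))
# -> call 5551234567 now _longnumber_
print(is_long_number("call 5551234567 now", threshold=7, flag_res=True))
# -> 10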
Example #7
import re
from nltk.stem import WordNetLemmatizer

def tokenize(text):
    # keep anchor tags, other HTML tags, and word-like tokens (including
    # apostrophes, @mentions and #hashtags); plain punctuation is dropped
    tokens = re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())

    # lemmatize words: try the noun form first and fall back to the verb
    # form when the noun lemmatization leaves the token unchanged
    lmtzr = WordNetLemmatizer()
    for i in range(len(tokens)):
        res = lmtzr.lemmatize(tokens[i])
        if res == tokens[i]:
            tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
        else:
            tokens[i] = res
    return tokens
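# A hedged usage sketch (requires the NLTK WordNet data, e.g. after
# nltk.download('wordnet'); the input string is invented):
print(tokenize("The cats were running <b>fast</b>"))
# Roughly: ['the', 'cat', 'be', 'run', '<b>', 'fast', '</b>']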
Example #8
    def get_score(self, text):
        if self.language == 'es':
            # Szigriszt-Pazos style perspicuity score; the vowel count is a
            # rough syllable estimate (accented vowels are not counted).
            sentences = len(self.sentence_tokenizer.tokenize(text))
            clean_text = self.clean_text(text)
            words = len(clean_text.split())
            syllables = len(re.findall('[aeiou]', clean_text))
            score = 206.835 - (62.3 *
                               (syllables / words)) - (words / sentences)
            return score
        elif self.language == 'en':
            # Automated Readability Index (ARI).
            total_words = len(text.split())
            characters = len("".join(self.clean_text(text).split()))
            sentences = len(self.sentence_tokenizer.tokenize(text))
            score = round((4.71 * (characters / total_words)) +
                          (0.5 * (total_words / sentences)) - 21.43)
        return score
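# Working the English branch (the Automated Readability Index) by hand for a
# made-up sentence, assuming clean_text simply strips punctuation:
total_words = 6     # "This is a short example sentence."
characters = 27     # letters only, punctuation stripped
sentences = 1
print(round(4.71 * (characters / total_words) +
            0.5 * (total_words / sentences) - 21.43))   # -> 3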
Example #9
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()  # the original module defines ps elsewhere; a Porter stemmer is assumed


def stemming(string, top):
    # keep letters only, lower-case, and tokenize
    string_rep = re.sub('[^a-zA-Z]', ' ', string)
    string_rep = string_rep.lower()
    new_vocab = word_tokenize(string_rep)
    #inter_stem=[ps.stem(l) for l in new_vocab]
    stop_set = set(stopwords.words('english'))
    vocab_stem = [ps.stem(w) for w in new_vocab if w not in stop_set]
    dictry_stem = set(vocab_stem)
    data = ' '.join(vocab_stem)

    # count how often each stem occurs (whole-word matches only)
    arr = []
    t = 0
    for i in dictry_stem:
        c = len(re.findall(r'\b' + re.escape(i) + r'\b', data))
        t = t + c
        arr.append([i, c])
        #print(i,c)

    # keep only the stems that occur more than once, most frequent first
    aux = [[q, w] for q, w in arr if w > 1]
    aux.sort(key=lambda item: item[1], reverse=True)

    x = [wrd for wrd, co in aux]
    y = [co for wrd, co in aux]

    print("total number of words:", t, "\t set of words:", len(dictry_stem))
    #print("Top ",top," words occurring more than twice: ",aux)
    print("percentage of total length these words account for:", (sum(y) / t) * 100)
    #plt.bar(x,y,color='red',alpha=0.8)
    #plt.xlabel('Word')
    #plt.ylabel('Frequency')
    #plt.title('Words occurring more than twice')
    return (x, string_rep)
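# A hedged call of stemming() (requires the NLTK 'punkt' and 'stopwords' data;
# the sample text is invented). It prints the counts and returns the repeated
# stems plus the cleaned text:
repeated, cleaned = stemming("Cats chase cats. Dogs chase the cat.", top=5)
print(repeated)   # e.g. ['cat', 'chase'] (stems occurring more than once)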
Example #10
s2 = '</s>'
Know = vocabRaw_tokens_nopunct[0] + vocabRaw_tokens_nopunct[
    1] + vocabRaw_tokens_nopunct[2]
vocabRaw_tokens_nopunct.append("[^" + Know + "]")
vocabRaw_tokens_nopunct.append(s1)
Px_a = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_b = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_c = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_UNK = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_s = [0.0, 0.0, 0.0, 0.0, 0.0]
Ps_x = [0.0, 0.0, 0.0, 0.0, 0.0]

for i in range(0, 5):
    Px_a[i] = len(
        re.findall(
            s1 + ".*" + vocabRaw_tokens_nopunct[0] + " " +
            vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
                re.findall(vocabRaw_tokens_nopunct[0], dataRaw))
    Px_b[i] = len(
        re.findall(
            s1 + ".*" + vocabRaw_tokens_nopunct[1] + " " +
            vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
                re.findall(vocabRaw_tokens_nopunct[1], dataRaw))
    Px_c[i] = len(
        re.findall(
            s1 + ".*" + vocabRaw_tokens_nopunct[2] + " " +
            vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
                re.findall(vocabRaw_tokens_nopunct[2], dataRaw))
    Px_UNK[i] = len(
        re.findall(
            s1 + ".*" + vocabRaw_tokens_nopunct[3] + " " +
Example #11
def horus_to_features(horusfile, le):
    print(horusfile)
    features, sentence_shape = [], []
    targets, tokens_shape, y_sentences_shape, y_tokens_shape = [], [], [], []

    df = pd.read_csv(horusfile,
                     delimiter=",",
                     skiprows=1,
                     header=None,
                     keep_default_na=False,
                     na_values=['_|_'])
    oldsentid = df.iloc[0, 1]
    for index, linha in df.iterrows():
        if len(linha) > 0:
            if linha[7] == 0:  #no compounds
                if linha[1] != oldsentid:
                    sentence_shape.append(features)
                    y_sentences_shape.append(targets)
                    targets, features = [], []

                idsent = linha[1]
                idtoken = linha[2]
                pos_bef = ''
                pos_aft = ''
                if index > 0 and df.at[index - 1, 7] == 0:
                    pos_bef = df.at[index - 1, 5]
                if index + 1 < len(df) and df.at[index + 1, 7] == 0:
                    pos_aft = df.at[index + 1, 5]
                token = linha[3]
                postag = linha[5]
                one_char_token = len(token) == 1
                special_char = len(
                    re.findall(r'(http://\S+|\S*[^\w\s]\S*)', token)) > 0
                first_capitalized = token[0].isupper()
                capitalized = token.isupper()
                title = token.istitle()
                digit = token.isdigit()
                stop_words = token in stop
                # len(horusfile[3]) is always 1 (a single character); the
                # intended check is presumably on the token length
                small = len(token) <= 2
                stemmer_lanc = lancaster_stemmer.stem(token)
                nr_images_returned = linha[17]
                nr_websites_returned = linha[25]
                hyphen = '-' in token
                cv_loc = float(linha[12])
                cv_org = float(linha[13])
                cv_per = float(linha[14])
                cv_dist = float(linha[15])
                cv_plc = float(linha[16])
                tx_loc = float(linha[20])
                tx_org = float(linha[21])
                tx_per = float(linha[22])
                tx_err = float(linha[23])
                tx_dist = float(linha[24])

                if linha[6] in definitions.NER_TAGS_LOC: ner = u'LOC'
                elif linha[6] in definitions.NER_TAGS_ORG: ner = u'ORG'
                elif linha[6] in definitions.NER_TAGS_PER: ner = u'PER'
                else: ner = u'O'

                #standard shape
                sel_features = [
                    idsent, idtoken, token,
                    token.lower(), stemmer_lanc, pos_bef, postag, pos_aft,
                    definitions.KLASSES2[ner],
                    le.transform(pos_bef),
                    le.transform(postag),
                    le.transform(pos_aft), title, digit, one_char_token,
                    special_char, first_capitalized, hyphen, capitalized,
                    stop_words, small, nr_images_returned,
                    nr_websites_returned, cv_org, cv_loc, cv_per, cv_dist,
                    cv_plc, tx_org, tx_loc, tx_per, tx_dist, tx_err
                ]

                features.append(sel_features)

                if linha[51] in definitions.NER_TAGS_LOC: y = u'LOC'
                elif linha[51] in definitions.NER_TAGS_ORG: y = u'ORG'
                elif linha[51] in definitions.NER_TAGS_PER: y = u'PER'
                else: y = u'O'

                targets.append(y)

                tokens_shape.append(sel_features[9:len(sel_features)])
                y_tokens_shape.append(definitions.KLASSES2[y])

                oldsentid = linha[1]

    print('total of sentences', len(sentence_shape))
    print('total of tokens', len(tokens_shape))
    return sentence_shape, y_sentences_shape, tokens_shape, y_tokens_shape
Example #12
def extract_named_entity(named_entities_tree, entity_type):
    # Find every "(<entity_type> ...)" group in the parse-tree string and keep
    # only the word part of each word/TAG pair (result[5:][:-1] assumes a
    # four-character entity label plus the surrounding brackets).
    return map(
        lambda result: ' '.join(
            map(lambda inner_result: inner_result.split('/')[0],
                result[5:][:-1].split())),
        re.findall(r'\({}.*\)'.format(entity_type), named_entities_tree))
Example #13
# 100 and returned to string format. After the validation the output is provided saying a match is found,
# what the match is, the currency type and the quantity.
#-----------------------------------------------------------------------------------------------------------

import re
import nltk
from urllib import request

# Prints presentation text.
print("=" * 80 + "\n PART 2 : REGULAR EXPRESSIONS, FSAs and FSTs \n" + "=" * 80)

# Add the website source and scrape the contents of the <p> tags.
print("Loading website 'http://www.bbc.co.uk/news/business-41779341'...")
website = request.urlopen('http://www.bbc.co.uk/news/business-41779341').read().decode('utf-8')
print("Finding text between <p></p> tags...")
pTagText = re.findall('<p>(.*?)</p>', website, flags=re.I)

# The regular expression: either a currency symbol followed by an amount, or
# an amount followed by a currency word.
regular_expression = r'(?:([€$£])((?:\d\d{0,2})(?:,\d{3})*(?:\.\d+)?(?:k|mn|bn|tn)?))|(?:((?:\d\d{0,2})(?:,\d{3})*(?:\.\d+)?(?:k|mn|bn|tn)?)( ?pence|p| ?euros?| ?dollars?| ?pounds?))'

# Iterates through the text gained from the <p> tags and applies the regular expression on them to search
# for matches, which if found, are added to the results array.
results = []
print("Finding currency related text from website contents...\n")
for p in pTagText:
    results += re.findall(regular_expression, p, flags=re.I)

# If no results are found, say so.
if len(results) == 0:
    print("No matches found.")
    
Example #14
import re
import nltk
from nltk import word_tokenize, pos_tag
from urllib import request

#Ask the user whether to search for phone numbers in pasted text or in a URL.
choose = input(
    "Please let us know if you want to search phone numbers from text or URL?"
)
if choose == 'text':
    #If the choice is 'text', read the text from the user and store the
    #phone-number regular expression in reg
    text = input("Please Enter a text:")
    reg = r'(((^)|(\s))((\+\d{2}\s\d{2}\s{0,1}\d{8})|([1-9]\d{3}\s\d{6})|((0)((0\d{2}\s\d{2}\s{0,1}\d{8})|([1-9]\d{3}\s{0,1}\d{6})|(\d{2}\s\d{2}\s{0,1}\d{8})))))'

    phone = re.findall(reg, text)
    print(text)
    instate = 'q0'
    if not phone:
        print("Sorry, no matches found")
    else:
        for i in range(len(phone)):
            print("Match Found:", phone[i][0], "at:", i)

#Getting phone numbers from a URL:
#If the user wishes to enter a URL, ask for it.

elif choose == "URL":
    url = input("Please Enter the URL in full standard format:")
    #Use urlopen to open that URL; the response is then decoded and stored in rw as HTML
    response = request.urlopen(url)
Example #15
def tokens(text):
    return re.findall('[a-z]+', text.lower())
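# Example use of tokens() (needs `import re`; the sample string is invented):
print(tokens("Hello, World! It's 2024."))
# -> ['hello', 'world', 'it', 's']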
Example #16
# Read the SimLex word pairs and split every line on tabs.
with open("SimLex999-100.txt", "r") as f:
    lines = f.readlines()
#lines=str(lines)
result = []
import re
lis1 = []
for i in range(len(lines)):
    lis1 = lis1 + lines[i].split('\t')
lis1
strtry = str(lis1)
# Keep only alphabetic tokens that are at least two letters long.
strtry = " ".join(re.findall("[a-zA-Z]{2,}", strtry))
#print(strtry)
las = strtry.split(' ')
las
sent1 = list()
sent2 = list()
# Alternate the words into two lists: the first word of each pair goes to
# sent1, the second to sent2.
for index, word in enumerate(las):
    if index % 2 != 0:
        sent2.append(word)
    else:
        sent1.append(word)
from nltk.corpus import wordnet
syn_name1 = list()
for i in range(0,len(sent1)):
    syn = wordnet.synsets(sent1[i])[0] 
    syn_name1.append(syn.name())
syn_name1
from nltk.corpus import wordnet
syn_name2 = list()