Code example #1
Score: 0
File: language.py — Project: ChipaKraken/iRnWs
 def check(self, sentence):
     """Guess the most likely language for *sentence*.

     Multiplies per-window scores from check_word() over every sliding
     window of the boundary-marked word, for every trained language, and
     returns the language with the highest product.
     """
     scores = dict.fromkeys(self.langs.keys(), 1)
     for token in fa('\w+', sentence):
         # Mark word boundaries, then slide a shrinking window over it.
         window = '^' + token.lower() + '$'
         while len(window) >= 2:
             for language in self.langs:
                 scores[language] *= self.check_word(window, language)
             window = window[1:]
     return max(scores, key=scores.get)
Code example #2
Score: 0
    def toSymbol(self, inputSen):
        """Rewrite spelled-out currency amounts in *inputSen* using currency symbols.

        e.g. "12.50 euros" -> "€12.50"; alphabetic symbols are placed after the
        number, all others before it.  A second pass converts fractional-unit
        amounts (e.g. cents) into the basic unit.

        Relies on module-level aliases used throughout this file:
        fa = re.findall, s = re.sub, and ud() — presumably a case/variant
        transformer for currency names (TODO confirm).
        """
        doneCurrencyNames = self.doneCurrencyNames
        for k, v in doneCurrencyNames.items():
            # Only currencies with a known symbol can be rewritten.
            if v['symbol'] is not None:
                if isinstance(v['symbol'],list):
                    symbol = v['symbol'][0]  # several symbols listed: use the first
                else:
                    symbol = v['symbol']
                # Keep only the last word of a multi-word name ("US dollar" -> "dollar").
                nounCurrency = s('^.* ','',k,flags=IGNORECASE)  # fm('[a-z]+', k, flags=IGNORECASE) != None
                if ' ' not in k:  # exception: one-word currency: euro, bitcoin
                    nounCurrency = k
                if v['plural'] is not None:
                    pluralPrefix = f'{v["plural"]}|{ud(v["plural"])}|'  # orders matter. It's always {plural|singular} rather than {singular|plural}
                else:
                    pluralPrefix = ''
                # Find "<number> <currency-name>" occurrences (optional plural suffix).
                allInstances = fa(f'[0-9\\,\\.]+ (?:{pluralPrefix}{k}|{ud(k)}|{nounCurrency}|{ud(nounCurrency)})(?:e?s)?', inputSen, flags=IGNORECASE)    # all should be uncaptured. Use (?:) instead of ()   <- captured
                trailingZeroRegex = r"\.0$"
                for i in allInstances:
                    numberPart = fa(f'[0-9\\,\\.]+', i)
                    # Left or right ?
                    if len(numberPart) > 0:
                        # Reformat the number: strip thousands commas, re-group with
                        # spaces, and drop a trailing ".0".
                        if symbol.isalpha():  # Cyrillic and zloty are alphas !
                            # rpmt = f'{numberPart[0]} {symbol}'
                            rpmt = f'{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",",""))).replace(","," "))} {symbol}'
                        else:
                            # rpmt = f'{symbol}{numberPart[0]}'
                            rpmt = f'{symbol}{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",",""))).replace(","," "))}'
                        inputSen = s(i, rpmt, inputSen, flags=IGNORECASE)

                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                # Same pass for the fractional unit (e.g. "50 cents"); the amount is
                # divided by numToBasic to express it in the basic unit.
                if v['fractional']['name'] is not None:
                    nounFraction = v['fractional']['name']
                    allInstances = fa(f'[0-9\\,\\.]+ (?:{k}|{ud(k)}|{nounFraction}|{ud(nounFraction)})(?:e?s)?', inputSen, flags=IGNORECASE)    # all should be uncaptured. Use (?:) instead of ()   <- captured
                    for i in allInstances:
                        numberPart = fa(f'[0-9\\,\\.]+', i)
                        # Left or right ?
                        if len(numberPart) > 0:
                            if symbol.isalpha():  # Cyrillic and zloty are alphas !
                                rpmt = f'{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",","")) / v["fractional"]["numToBasic"]).replace(","," "))} {symbol}'
                            else:
                                rpmt = f'{symbol}{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",","")) / v["fractional"]["numToBasic"]).replace(","," "))}'
                            inputSen = s(i, rpmt, inputSen, flags=IGNORECASE)
        return inputSen
Code example #3
Score: 0
File: language.py — Project: ChipaKraken/iRnWs
 def training(self, lang, path):
     """Train *lang* by counting character unigrams/bigrams in the text at *path*.

     Stores boundary-marked counts ('^a', 'b$') plus every interior bigram and
     its leading unigram into self.langs[lang] as a plain dict.
     """
     counts = dd(int)
     with open(path) as handle:
         text = handle.read()
     for token in fa('\w+', text):
         window = token.lower()
         # First and last character, tagged with boundary markers.
         counts['^' + window[:1]] += 1
         counts[window[-1:] + '$'] += 1
         # Slide across the word, counting each bigram and its first char.
         while len(window) >= 2:
             counts[window[:2]] += 1
             counts[window[:1]] += 1
             window = window[1:]
     self.langs[lang] = dict(counts)
Code example #4
Score: 0
def findMatches(pattern):
    """Return the set of words in the module-level *txt* that match *pattern*
    and pass checkWord() against the module-level *letters* pool.

    Prints progress diagnostics along the way.  Fix: the original used
    Python-2-only print statements (a SyntaxError on Python 3); single-argument
    parenthesized prints behave identically on both interpreters.
    """
    matches = []

    print("re pattern: %s" % pattern)
    # Deduplicate the regex hits before the more expensive checkWord pass.
    words = list(set(fa(pattern, txt)))
    print("Possible matches: %d" % len(words))
    for candidate in words[:5]:
        print(candidate)
    if len(words) > 5:
        print('...')

    for word in words:
        # checkWord mutates the letter pool it is given, so pass a fresh copy.
        if checkWord(word, list(letters)):
            matches.append(word)
    matches = set(matches)
    print("Matches:")
    for m in matches:
        print(m)

    print("%d matches" % len(matches))
    return matches
Code example #5
Score: 0
def download(th):
    """Worker for thread *th*: drain the shared *file_lines* list, downloading
    each image URL into a per-category directory.

    Relies on module globals: file_lines (list of (wnid-tag, url) entries),
    offsets_dict (WordNet offset -> synset), img_base_dir, requests.
    Fixes: the check/pop pair was racy across threads (IndexError when another
    worker emptied the list in between); the target directory was never
    created; handler.close() inside `with` was redundant.
    """
    print("thread ", th, " started")
    n = 0
    while file_lines:
        print("thread ", th, " working on image ", n)
        # Another worker may empty the list between the truthiness check
        # and the pop — treat that as "queue drained".
        try:
            entry = file_lines.pop()
        except IndexError:
            break
        # entry[0] looks like "n<offset>_..."; map the offset to its first lemma.
        cat = offsets_dict[int(fa("n(\d+)_*", entry[0])[0])].lemma_names()[0]
        directory = img_base_dir + cat
        img_name = cat + "_" + str(th) + "_" + str(n) + entry[1][-4:]
        print("thread ", th, "downloading: ", entry[1])
        try:
            img_data = requests.get(entry[1]).content
        except requests.exceptions.RequestException as e:
            print(e)
            continue  # skip this entry; n is deliberately not advanced
        os.makedirs(directory, exist_ok=True)  # ensure the category dir exists
        img_path = os.path.join(directory, img_name)
        # The context manager closes the file; no explicit close() needed.
        with open(img_path, 'wb') as handler:
            handler.write(img_data)
        print("thread ", th, " image ", n, "done")
        n = n + 1
    print("thread ", th, "all done")
Code example #6
Score: 0
def getPlural(key):
    """Scrape the Wikipedia edit box for *key* and return the lower-cased value
    of the infobox 'plural =' field, or None when it cannot be extracted.

    Uses module-level aliases: u = urlopen, q = URL quote, b = BeautifulSoup,
    fa = re.findall, s = re.sub (presumed from usage — TODO confirm).
    """
    # f'https://en.wiktionary.org/wiki/{123}'
    page = u(f'https://en.wikipedia.org/w/index.php?title={q(key)}&action=edit')
    markup = b(page.read().decode('utf-8'), 'html.parser')

    # Guard clauses: bail out as soon as any extraction step comes up empty.
    textbox = markup.select('#wpTextbox1')
    if not textbox:
        return None

    hits = fa('(?<=plural = ).*', textbox[0].text)
    if not hits:
        return None

    cleaned = s(
        r'(\{.*\}|\(.*\)|\t| |&nbsp;|\'\'.*\'\'|.*\: |<br>|<!--.*-->|/.*|\{\{.*\|)',
        '', hits[0]
    )  # TODO: {{plainlist}} not supported, needs a workaround
    if len(cleaned) <= 1:
        return None

    print(f'{key}: {cleaned.lower()}')
    return cleaned.lower()
Code example #7
Score: 0
    for word in words:
        if checkWord(word, [i for i in letters]):
            matches.append(word)
    matches = set(matches)
    print "Matches:"
    for i in matches:
        print i

    print "%d matches" % len(matches)
    return matches
    #print "Num words: %d" %len(fa('\n',txt))


# Build the candidate letter pool from the CLI: argv[1] is the rack of
# letters, argv[2] is the puzzle template ('.' = any letter, '*' = wildcard).
letters = [i for i in sys.argv[1]]
puzzle = sys.argv[2]
# Letters already fixed in the puzzle are also available for matching.
letters += fa('[a-z]', puzzle)
print "letters: %s" % ''.join(letters)

patterns = []
alphabet = '[a-zA-Z]'
if not '*' in puzzle:
    # No wildcard: translate the template directly into a single regex string.
    patterns = ''.join([alphabet if i == '.' else i for i in puzzle])
else:
    for i in puzzle:
        # TODO: Figure out what to do here
        if i == '*':
            # NOTE(review): 'patterns' is still the empty list here, so these
            # concatenations are no-ops — the '*' branch looks unfinished
            # (see the TODO above and the commented-out attempts below).
            for cycle in range(2):
                patterns = patterns + [j + alphabet for j in patterns]
            #setOne = [j + alphabet for j in patterns]
            #setTwo = [j + alphabet*2 for j in patterns]
            #patterns = setOne + setTwo
Code example #8
Score: 0
def main():
    """Print the longest run of uppercase ASCII letters on each stdin line.

    Fix: the original called max() directly on the findall() result, which
    raises ValueError on any line containing no uppercase run; such lines
    are now skipped.
    """
    for line in sys.stdin:
        runs = fa(r"[A-Z]+", line)
        if runs:
            print(max(runs, key=len))
Code example #9
Score: 0
File: TraitModel.py — Project: DanielTitkov/emmy
def extract_features(text,
                     morph=pymorphy2.MorphAnalyzer(),
                     pos_types=lib['pos'],
                     uncert=lib['uncert'],
                     cert=lib['cert'],
                     quan=lib['quan'],
                     imper=lib['imper'],
                     racio=lib['racio'],
                     dimin=lib['dimin'],
                     extrem=lib['extrem'],
                     like=lib['like'],
                     dislike=lib['dislike'],
                     polite=lib['polite'],
                     obscene=lib['obscene'],
                     slang=lib['slang']):
    """Compute surface, punctuation, POS/grammeme and lexical features for a
    (Russian) *text* and return them as a feature-name -> value dict.

    NOTE: the heavyweight defaults (MorphAnalyzer instance, lib[...] word
    lists) are evaluated once at import time and shared across calls —
    presumably intentional caching; callers must not mutate them.

    Fixes: the local previously named with a Cyrillic 'с' homoglyph
    ('braсket_list') is renamed to ASCII; unused locals n_pun / n_ad removed;
    divisions by len_word / len_char / len(pos_list) now go through the safe
    divider d() so empty or punctuation-free input returns 0.0 ratios
    instead of raising ZeroDivisionError.
    """
    from re import findall as fa
    # Surface statistics: length in chars, words and (rough) sentences.
    len_char = len(text)
    len_word = len(text.split())
    len_sent = len(fa('[^\.\!\?]+[\.\!\?]', text))
    len_sent = len_sent if len_sent else 1  # never divide by zero sentences
    pun = fa('[\.+,!\?:-]', text)
    bracket_list = fa('[\(\)]', text)

    # POS & grammeme lists via pymorphy2; first parse variant is taken as-is.
    def parse_text(text, morph=morph):
        # cleanse() is a project-level normalizer — TODO confirm its contract.
        tokens = cleanse(text).split()
        return [morph.parse(t) for t in tokens]

    parsed_text = parse_text(text)
    pos_list = [str(p[0].tag.POS) for p in parsed_text]
    n_nouns = len([t for t in pos_list if t == 'NOUN'])
    n_verbs = len([t for t in pos_list if t == 'VERB'])
    anim_list = [str(p[0].tag.animacy) for p in parsed_text]
    pers_list = [str(p[0].tag.person) for p in parsed_text]
    tns_list = [str(p[0].tag.tense) for p in parsed_text]
    asp_list = [str(p[0].tag.aspect) for p in parsed_text]

    r = lambda x: round(x, 4)  # uniform rounding for every ratio feature
    d = lambda x, y: x / y if y else 0.0  # division tolerating a zero denominator

    features = {
        #surface features
        'len_char':
        len_char,
        'len_word':
        len_word,
        'len_sent':
        len_sent,
        'm_len_word':
        r(d(len_char, len_word)),
        'm_len_sent':
        r(d(len_word, len_sent)),
        #punctuation
        'p_pun':
        r(d(len(pun), len_char)),
        'p_dot':
        r(d(len([i for i in pun if i == '.']), len(pun))),
        'p_qm':
        r(d(len([i for i in pun if i == '?']), len(pun))),
        'p_excl':
        r(d(len([i for i in pun if i == '!']), len(pun))),
        'p_comma':
        r(d(len([i for i in pun if i == ',']), len(pun))),
        'p_brkt':
        r(d(len(bracket_list), len_char)),
        'p_brkt_up':
        r(d(len([i for i in bracket_list if i == ')']), len(bracket_list))),
        #POS form
        'pos_form':
        ' '.join(pos_list),
        'pos_richness':
        len(set(pos_list)),
        #grammem features
        'p_anim':
        r(d(len([t for t in anim_list if t == 'anim']), n_nouns)),
        'p_1per':
        r(d(len([t for t in pers_list if t == '1per']), n_verbs)),
        'p_3per':
        r(d(len([t for t in pers_list if t == '3per']), n_verbs)),
        'p_past':
        r(d(len([t for t in tns_list if t == 'past']), n_verbs)),
        'p_fut':
        r(d(len([t for t in tns_list if t == 'futr']), n_verbs)),
        'p_pres':
        r(d(len([t for t in tns_list if t == 'pres']), n_verbs)),
        'p_perf':
        r(d(len([t for t in asp_list if t == 'perf']), n_verbs)),
        'p_conj':
        r(d(len(fa('\sбы?\s', text)), n_verbs)),
        #lexical features
        'p_uncert':
        r(d(len(fa('|'.join(uncert), text.lower())), len_word)),
        'p_cert':
        r(d(len(fa('|'.join(cert), text.lower())), len_word)),
        'p_quan':
        r(d(len(fa('|'.join(quan), text.lower())), len_word)),
        'p_imper':
        r(d(len(fa('|'.join(imper), text.lower())), len_word)),
        'p_racio':
        r(d(len(fa('|'.join(racio), text.lower())), len_word)),
        'p_dimin':
        r(d(len(fa('|'.join(dimin), text.lower())), len_word)),
        'p_extrem':
        r(d(len(fa('|'.join(extrem), text.lower())), len_word)),
        'p_like':
        r(d(len(fa('|'.join(like), text.lower())), len_word)),
        'p_dislike':
        r(d(len(fa('|'.join(dislike), text.lower())), len_word)),
        'p_polite':
        r(d(len(fa('|'.join(polite), text.lower())), len_word)),
        'p_obscene':
        r(d(len(fa('|'.join(obscene), text.lower())), len_word)),
        'p_slang':
        r(d(len(fa('|'.join(slang), text.lower())), len_word))
    }

    # Per-POS proportions; empty input yields 0.0 instead of crashing.
    for f in pos_types:
        features['p_' + f] = r(
            d(len([t for t in pos_list if t == f]), len(pos_list)))

    return features
Code example #10
Score: 0
from re import findall as fa
import re
import sys

# Each line of the processed CSV is treated as one verse.
with open('BookOfMormonProcessed.csv') as f:
    s = f.readlines()

matchVerses = []

# Find "command"/"commandment(s)" with up to 7 words of context on each side.
for verse in s:
    matches = fa('(?:\w+,? ){,7}command(?:ment)?s?(?:,? \w+){,7}', verse,
                 re.IGNORECASE)

    # Keep the verse only if it also mentions prosper/bless/promise.
    for i in matches:
        if fa(r'\bprosper\b|\bbless\b|\bpromise\b', verse, re.IGNORECASE):
            #    if fa(r'\bif\b',i, re.IGNORECASE) and fa(r'(?:\bkeep\b)|(?:\bobey\b)|(?:\bobedient\b)|(?:\bprosper\b)',i, re.IGNORECASE):
            print i
            matchVerses.append(verse)
            break

# NOTE(review): Python 2 print statements below — this snippet is py2-only.
for i, j in enumerate(matchVerses):
    print i, j

print "Matches found:", len(matchVerses)
Code example #11
Score: 0
'''
a b d e g o p q A D O P Q R 0 4 6 9  has only 1 bounded region
B 8 has 2 bounded regions.

ip: abcdef
op - 4 ( a,b,d,e has one bounded region i.e. 1*4 = 4)

'''

from re import findall as fa

ip = input()

# Collect the characters enclosing one region and those enclosing two,
# then total up the bounded regions.
single_hole = fa(r'[abdegopqADOPQR0469]', ip)
double_hole = fa(r'[B8]', ip)

print(len(single_hole) + len(double_hole) * 2)
Code example #12
Score: 0
File: concur.py — Project: andreasKK/PyConCur
    def visit_xrates(self):
        """
        Scrapes x-rates.com and extracts currency-rates against 1 EUR. 
        Stores JSON file containing timestamp, currencies and respective rates.

        Returns (self.rates, rates_alt), False on fetch failure, or hands off
        to self.all_rates() when panda mode is enabled.
        """
        # Fetch the page; any network failure or non-200 status aborts early.
        try:
            request = get(URL)
            if request.status_code != 200:
                return False
        except:
            return False
        # html is lxml.html; a missing lxml install surfaces here as NameError.
        try:
            answer = html.fromstring(request.content)
        except NameError:
            print(
                '\npip module \'lxml\' not installed\nor installed for the wrong Python version'
            )
            exit()
        """
        uses xpaths from DOM to get what we need from the html 
        it'll contain each line in answer as an item in a list
        """
        names_ans = answer.xpath(NAME_PATH)
        from_cur_ans = answer.xpath(FROM_PATH)
        to_cur_ans = answer.xpath(TO_PATH)
        """
        converts the items of all three lists
        into type string to utilize string methods 
        """
        names_raw = [html.tostring(i) for i in names_ans]
        from_cur_raw = [html.tostring(i) for i in from_cur_ans]
        to_cur_raw = [html.tostring(i) for i in to_cur_ans]

        # contains all the currency abbreviations
        self.abb_names = [i for i in CURRENCY.keys()]
        """
        extracts the name from the html string
        it uses re.findall() to match currencies
        from utilities with the actual html string.
        if statement ensures that negative matches are ignored
        """
        # NOTE(review): one LIST per currency is appended, so self.names is a
        # list of lists — the zip/concat below depends on that shape.
        self.names = []
        for check in CURRENCY.values():
            self.names.append([
                fa(check, str(name))[0] for name in names_raw
                if int(len(fa(check, str(name)))) > 0
            ])
        """
        uses find_floats() to extract
        floating numbers from html string
        """
        self.from_cur = [self.find_floats(i) for i in from_cur_raw]
        self.to_cur = [self.find_floats(i) for i in to_cur_raw]

        # concatenate names, from-rate and to-rate, against 1 EUR, in a list
        self.rates = [
            n + f + t
            for n, f, t in zip(self.names, self.from_cur, self.to_cur)
        ]

        # makes an alternative rates list with a timestamp
        rates_alt = {'timestamp': int(time()), 'rates': []}

        # the alt rates list uses abbreviations
        k = 0
        while k < len(self.rates):
            rates_alt['rates'].append({
                'currency': self.abb_names[k],
                'from': self.rates[k][1],
                'to': self.rates[k][2]
            })
            k += 1

        # data-source, displayed with output
        self.source = 'x-rates.com'

        # write it all to local file
        local_json = {
            'rates': self.rates,
            'rates_alt': rates_alt,
            'source': self.source
        }
        self.write_local(local_json)

        # Return raw data, or hand off to the pandas display path.
        if not self.panda:
            return self.rates, rates_alt
        else:
            self.all_rates()
Code example #13
Score: 0
File: concur.py — Project: andreasKK/PyConCur
 def find_floats(self, n):
     """
     Return every numeric token in str(n) — signed/unsigned floats or bare
     integers — as a list of strings.
     """
     number_pattern = r"[-+]?\d*\.\d+|\d+"
     return fa(number_pattern, str(n))
Code example #14
Score: 0
File: concur.py — Project: andreasKK/PyConCur
    def visit_ecb(self):
        """
        Scrapes ecb.europa.eu and extracts currency-rates against 1 EUR. 
        
        Then stores JSON file containing timestamp,
        the currencies and the respective rates.

        Returns (rates_alt, rates), False on fetch failure, or hands off to
        self.all_rates() when panda mode is enabled.
        """
        # Fetch the page; any network failure or non-200 status aborts early.
        try:
            request = get(URL_ALT)
            if request.status_code != 200:
                return False
        except:
            return False
        # html is lxml.html; a missing lxml install surfaces here as NameError.
        try:
            answer = html.fromstring(request.content)
        except NameError:
            print(
                '\npip module \'lxml\' not installed\nor installed for the wrong Python version'
            )
            exit()
        """
        I could not manage to get the respective 
        xpaths for names, cur_from and cur_to but I did 
        manage to get all of them in one big string. 

        Thus, raw_list contains each html-line 
        with type string as an item in a list. 
        """
        raw = answer.xpath(ALT_PATH)
        raw_list = [html.tostring(i) for i in raw]
        """
        This is all extractions from visit_xrates()
        basically cramped into a nested for loop.
        Extrats name and 1 EUR to that given name,
        then finds the inverse of that and appends all of it. 
        """
        rates = []
        for item in raw_list:
            # NOTE(review): 'curenncy' is a typo for 'currency' (local only).
            for curenncy in CURRENCY:
                if int(len(fa(curenncy, str(item)))) > 0:
                    rates.append({
                        # first regex hit; the extra parens/brackets are redundant
                        'currency': ((([fa(curenncy, str(item))[0]])))[0],
                        'from':
                        self.find_floats(item)[0],
                        # inverse rate: EUR per 1 unit of the currency
                        'to': (1 / float(self.find_floats(item)[0]))
                    })

        # makes the alternative rates list with currency abbreviations
        rates_alt = []
        k = 0
        while k < len(rates):
            rates_alt.append((CURRENCY[rates[k]['currency']], rates[k]['from'],
                              rates[k]['to']))
            k += 1
        """
        sort() returns TypeError on rates
        thus using the value for key 'currency' 
        in order to sort alphabetically
        """
        rates.sort(key=lambda value: value['currency'], reverse=False)
        rates_alt.sort()

        self.source = 'ecb.europa.eu'
        """
        adds timestamp to make sure the two scraping functions
        saves in the exact same structure for consistent extraction
        """
        local_json = {
            'rates': rates_alt,
            'rates_alt': {
                'timestamp': int(time()),
                'rates': rates
            },
            'source': self.source
        }

        # write it all to local file
        self.write_local(local_json)

        # Return raw data, or hand off to the pandas display path.
        if not self.panda:
            return rates_alt, rates
        else:
            self.all_rates()