def check(self, sentence):
    """Guess the language of *sentence*.

    Every word is wrapped as '^word$' and scored by sliding suffixes:
    each suffix of length >= 2 multiplies each language's running score
    by self.check_word(...).  The language with the largest product wins.
    """
    scores = {lang: 1 for lang in self.langs}
    for token in fa('\w+', sentence):
        fragment = '^' + token.lower() + '$'
        # slide a shrinking window over the word; stop at a single char
        while len(fragment) >= 2:
            for lang in scores:
                scores[lang] *= self.check_word(fragment, lang)
            fragment = fragment[1:]
    return max(scores, key=scores.get)
def toSymbol(self, inputSen):
    """Replace written-out currency amounts in *inputSen* with symbol form.

    For every currency in self.doneCurrencyNames, occurrences such as
    "12,000 australian dollars" are rewritten using the currency symbol;
    thousands commas become spaces and a trailing ".0" is stripped.
    A second pass handles the fractional unit (e.g. cents), dividing by
    v['fractional']['numToBasic'] to express the amount in the basic unit.

    NOTE(review): relies on module-level aliases — presumably s = re.sub,
    fa = re.findall and ud() a case-variant helper — confirm against the
    file's imports.
    """
    doneCurrencyNames = self.doneCurrencyNames
    for k, v in doneCurrencyNames.items():
        if v['symbol'] is not None:
            # some currencies carry a list of symbols; the first one is used
            if isinstance(v['symbol'], list):
                symbol = v['symbol'][0]
            else:
                symbol = v['symbol']
            # keep only the last word of a multi-word currency name
            nounCurrency = s('^.* ', '', k, flags=IGNORECASE)
            # fm('[a-z]+', k, flags=IGNORECASE) != None
            if ' ' not in k:
                # exception: one-word currency: euro, bitcoin
                nounCurrency = k
            if v['plural'] is not None:
                # orders matter. It's always {plural|singular} rather than {singular|plural}
                pluralPrefix = f'{v["plural"]}|{ud(v["plural"])}|'
            else:
                pluralPrefix = ''
            # all should be uncaptured. Use (?:) instead of () <- captured
            allInstances = fa(f'[0-9\\,\\.]+ (?:{pluralPrefix}{k}|{ud(k)}|{nounCurrency}|{ud(nounCurrency)})(?:e?s)?', inputSen, flags=IGNORECASE)
            trailingZeroRegex = r"\.0$"  # strips trailing ".0" left by float formatting
            for i in allInstances:
                numberPart = fa(f'[0-9\\,\\.]+', i)  # Left or right ?
                if len(numberPart) > 0:
                    if symbol.isalpha():
                        # Cyrillic and zloty are alphas !  Alphabetic symbols
                        # go after the number, separated by a space.
                        # rpmt = f'{numberPart[0]} {symbol}'
                        rpmt = f'{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",",""))).replace(","," "))} {symbol}'
                    else:
                        # rpmt = f'{symbol}{numberPart[0]}'
                        rpmt = f'{symbol}{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",",""))).replace(","," "))}'
                    inputSen = s(i, rpmt, inputSen, flags=IGNORECASE)
            # ~~~~~~~~~~~~~~~ second pass: fractional unit (e.g. cents) ~~~~~~~~~~~~~~~
            if v['fractional']['name'] is not None:
                nounFraction = v['fractional']['name']
                # all should be uncaptured. Use (?:) instead of () <- captured
                allInstances = fa(f'[0-9\\,\\.]+ (?:{k}|{ud(k)}|{nounFraction}|{ud(nounFraction)})(?:e?s)?', inputSen, flags=IGNORECASE)
                for i in allInstances:
                    numberPart = fa(f'[0-9\\,\\.]+', i)  # Left or right ?
                    if len(numberPart) > 0:
                        if symbol.isalpha():  # Cyrillic and zloty are alphas !
                            # divide by numToBasic to convert to the basic unit
                            rpmt = f'{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",","")) / v["fractional"]["numToBasic"]).replace(","," "))} {symbol}'
                        else:
                            rpmt = f'{symbol}{s(trailingZeroRegex,"","{:,}".format(float(numberPart[0].replace(",","")) / v["fractional"]["numToBasic"]).replace(","," "))}'
                        inputSen = s(i, rpmt, inputSen, flags=IGNORECASE)
    return inputSen
def training(self, lang, path):
    """Build the n-gram frequency table for *lang* from the text file at *path*.

    Counts word-start ('^x') and word-end ('x$') markers plus every bigram
    and its leading unigram, then stores the table in self.langs[lang].
    """
    counts = dd(int)
    with open(path) as source:
        text = source.read()
    for token in fa('\w+', text):
        fragment = token.lower()
        counts['^' + fragment[:1]] += 1
        counts[fragment[-1:] + '$'] += 1
        # walk the word left-to-right, counting each bigram and its first char
        while len(fragment) >= 2:
            counts[fragment[:2]] += 1
            counts[fragment[:1]] += 1
            fragment = fragment[1:]
    self.langs[lang] = dict(counts)
def findMatches(pattern):
    """Return the set of words from the global `txt` that match *pattern*
    and pass checkWord() against the available global `letters`.

    Prints progress as it goes (pattern, candidate count, first few
    candidates, and the final matches).

    Fix: converted Python 2 print statements to print() calls — the rest of
    this file uses f-strings (Python 3.6+), so the py2 syntax could not even
    parse in the same interpreter.
    """
    matches = []
    print("re pattern: %s" % pattern)
    words = fa(pattern, txt)
    words = list(set(words))  # de-duplicate candidates
    print("Possible matches: %d" % len(words))
    for i in words[:5]:
        print(i)
    if len(words) > 5:
        print('...')
    for word in words:
        # checkWord gets a fresh copy so it may consume letters freely
        if checkWord(word, [i for i in letters]):
            matches.append(word)
    matches = set(matches)
    print("Matches:")
    for i in matches:
        print(i)
    print("%d matches" % len(matches))
    return matches
def download(th):
    """Worker thread *th*: drain the shared global `file_lines`, downloading
    each image URL into a category directory derived from `offsets_dict`.

    Each entry is (wnid-tagged id, url); the WordNet offset parsed from
    entry[0] selects the category (first lemma name), which names both the
    target directory and the file prefix.

    Fixes: `file_lines.pop()` raced with the `while file_lines` check when
    several threads drain the same list — another thread can empty it between
    the check and the pop, raising IndexError; the pop is now guarded.
    Also removed the redundant handler.close() inside the `with` block.
    """
    print("thread ", th, " started")
    n = 0
    while file_lines:
        print("thread ", th, " working on image ", n)
        try:
            # list.pop() is atomic, but the emptiness check above is not:
            # another worker may have taken the last entry in between.
            entry = file_lines.pop()
        except IndexError:
            break
        cat = offsets_dict[int(fa(r"n(\d+)_*", entry[0])[0])].lemma_names()[0]
        directory = img_base_dir + cat
        img_name = cat + "_" + str(th) + "_" + str(n) + entry[1][-4:]
        print("thread ", th, "downloading: ", entry[1])
        try:
            img_data = requests.get(entry[1]).content
        except requests.exceptions.RequestException as e:
            # best-effort: log and move on; n is intentionally not incremented
            print(e)
            continue
        img_path = os.path.join(directory, img_name)
        with open(img_path, 'wb') as handler:
            handler.write(img_data)
        print("thread ", th, " image ", n, "done")
        n = n + 1
    print("thread ", th, "all done")
def getPlural(key):
    """Look up the plural form of *key* by scraping the edit-mode wikitext
    of its Wikipedia article; return the lowercased plural, or None when
    the page, the 'plural =' field, or a usable value is missing.
    """
    # f'https://en.wiktionary.org/wiki/{123}'
    page = u(f'https://en.wikipedia.org/w/index.php?title={q(key)}&action=edit')
    markup = page.read().decode('utf-8')
    soup = b(markup, 'html.parser')
    # the raw wikitext lives in the edit-form textarea
    boxes = soup.select('#wpTextbox1')
    if not boxes:
        return None
    hits = fa('(?<=plural = ).*', boxes[0].text)
    if not hits:
        return None
    # strip templates, parentheticals, markup and comments from the value
    formatted = s(
        r'(\{.*\}|\(.*\)|\t| | |\'\'.*\'\'|.*\: |<br>|<!--.*-->|/.*|\{\{.*\|)',
        '',
        hits[0]
    )  # TODO: {{plainlist}} not supported, needs a workaround
    if len(formatted) <= 1:
        return None
    print(f'{key}: {formatted.lower()}')
    return formatted.lower()
# NOTE(review): indentation was lost in this chunk.  Everything up to
# `return matches` is the tail of a word-matching function (it duplicates the
# tail of findMatches defined earlier); its `def` line is not visible here,
# so the one-level indent below is reconstructed, not original.
    # keep only candidates buildable from the available letters
    for word in words:
        if checkWord(word, [i for i in letters]):
            matches.append(word)
    matches = set(matches)
    print "Matches:"
    for i in matches:
        print i
    print "%d matches" % len(matches)
    return matches

#print "Num words: %d" %len(fa('\n',txt))

# ---- module-level driver for the letter/puzzle matching script ----
# argv[1]: letters available to the player; argv[2]: the puzzle template,
# where '.' marks one unknown letter and '*' a wildcard of unknown length.
letters = [i for i in sys.argv[1]]
puzzle = sys.argv[2]
# letters already fixed in the puzzle are also available for matching
letters += fa('[a-z]', puzzle)
print "letters: %s" % ''.join(letters)
patterns = []
alphabet = '[a-zA-Z]'
if not '*' in puzzle:
    # fixed-length template: each '.' becomes a one-letter character class.
    # NOTE(review): this branch assigns a single string, while the '*' branch
    # treats `patterns` as a list — the two branches are inconsistent.
    patterns = ''.join([alphabet if i == '.' else i for i in puzzle])
else:
    for i in puzzle:
        # TODO: Figure out what to do here
        if i == '*':
            # NOTE(review): `patterns` is still the empty list here, so these
            # comprehensions produce nothing — branch looks unfinished (see TODO).
            for cycle in range(2):
                patterns = patterns + [j + alphabet for j in patterns]
            #setOne = [j + alphabet for j in patterns]
            #setTwo = [j + alphabet*2 for j in patterns]
            #patterns = setOne + setTwo
def main():
    """For each line on stdin, print the longest run of consecutive
    uppercase ASCII letters (earliest run wins ties).
    """
    for line in sys.stdin:
        runs = fa(r"[A-Z]+", line)
        print(max(runs, key=len))
def extract_features(text, morph=pymorphy2.MorphAnalyzer(), pos_types=lib['pos'],
                     uncert=lib['uncert'], cert=lib['cert'], quan=lib['quan'],
                     imper=lib['imper'], racio=lib['racio'], dimin=lib['dimin'],
                     extrem=lib['extrem'], like=lib['like'], dislike=lib['dislike'],
                     polite=lib['polite'], obscene=lib['obscene'], slang=lib['slang']):
    """Compute surface, punctuation, POS/grammeme and lexical features of *text*.

    Returns a dict of feature-name -> value; ratio features are rounded to
    4 digits via the local helper `r`, with `d` guarding divisions by zero.

    NOTE(review): the defaults (the MorphAnalyzer instance and the lib[...]
    word lists) are evaluated once at import time and shared across all
    calls — fine if treated as read-only, but worth confirming.
    NOTE(review): `braсket_list` contains a Cyrillic 'с' — it works because
    both occurrences use the same spelling, but it is a refactoring trap.
    """
    from re import findall as fa
    #length in chars and words
    len_char = len(text)
    len_word = len(text.split())
    # sentences = maximal runs ending in . ! or ?; floor at 1 to avoid /0
    len_sent = len(fa('[^\.\!\?]+[\.\!\?]', text))
    len_sent = len_sent if len_sent else 1
    pun = fa('[\.+,!\?:-]', text)
    n_pun = len(pun)
    braсket_list = fa('[\(\)]', text)
    #POS & grammem
    def parse_text(text, morph=morph):
        # tokenize with the (project-level) cleanse() helper, then morph-parse
        tokens = cleanse(text).split()
        return [morph.parse(t) for t in tokens]
    parsed_text = parse_text(text)
    # first (most probable) parse decides the tag for each token
    pos_list = [str(p[0].tag.POS) for p in parsed_text]
    n_nouns = len([t for t in pos_list if t == 'NOUN'])
    n_verbs = len([t for t in pos_list if t == 'VERB'])
    n_ad = len([t for t in pos_list if t in ['ADJF', 'ADVB']])
    anim_list = [str(p[0].tag.animacy) for p in parsed_text]
    pers_list = [str(p[0].tag.person) for p in parsed_text]
    tns_list = [str(p[0].tag.tense) for p in parsed_text]
    asp_list = [str(p[0].tag.aspect) for p in parsed_text]
    r = lambda x: round(x, 4)          # round ratios to 4 digits
    d = lambda x, y: x / y if y else 0.0  # safe division
    features = {
        #surface features
        'len_char': len_char,
        'len_word': len_word,
        'len_sent': len_sent,
        'm_len_word': r(len_char / len_word),
        'm_len_sent': r(len_word / len_sent),
        #punctuation
        'p_pun': r(len(pun) / len_char),
        'p_dot': r(d(len([i for i in pun if i == '.']), len(pun))),
        'p_qm': r(d(len([i for i in pun if i == '?']), len(pun))),
        'p_excl': r(d(len([i for i in pun if i == '!']), len(pun))),
        'p_comma': r(d(len([i for i in pun if i == ',']), len(pun))),
        'p_brkt': r(len(braсket_list) / len_char),
        'p_brkt_up': r(d(len([i for i in braсket_list if i == ')']), len(braсket_list))),
        #POS form
        'pos_form': ' '.join(pos_list),
        'pos_richness': len(set(pos_list)),
        #grammem features
        'p_anim': r(d(len([t for t in anim_list if t == 'anim']), n_nouns)),
        'p_1per': r(d(len([t for t in pers_list if t == '1per']), n_verbs)),
        'p_3per': r(d(len([t for t in pers_list if t == '3per']), n_verbs)),
        'p_past': r(d(len([t for t in tns_list if t == 'past']), n_verbs)),
        'p_fut': r(d(len([t for t in tns_list if t == 'futr']), n_verbs)),
        'p_pres': r(d(len([t for t in tns_list if t == 'pres']), n_verbs)),
        'p_perf': r(d(len([t for t in asp_list if t == 'perf']), n_verbs)),
        # standalone particle "бы(б)" — presumably a conditional-mood marker
        'p_conj': r(d(len(fa('\sбы?\s', text)), n_verbs)),
        #lexical features: share of words hit by each lexicon's alternation regex
        'p_uncert': r(len(fa('|'.join(uncert), text.lower())) / len_word),
        'p_cert': r(len(fa('|'.join(cert), text.lower())) / len_word),
        'p_quan': r(len(fa('|'.join(quan), text.lower())) / len_word),
        'p_imper': r(len(fa('|'.join(imper), text.lower())) / len_word),
        'p_racio': r(len(fa('|'.join(racio), text.lower())) / len_word),
        'p_dimin': r(len(fa('|'.join(dimin), text.lower())) / len_word),
        'p_extrem': r(len(fa('|'.join(extrem), text.lower())) / len_word),
        'p_like': r(len(fa('|'.join(like), text.lower())) / len_word),
        'p_dislike': r(len(fa('|'.join(dislike), text.lower())) / len_word),
        'p_polite': r(len(fa('|'.join(polite), text.lower())) / len_word),
        'p_obscene': r(len(fa('|'.join(obscene), text.lower())) / len_word),
        'p_slang': r(len(fa('|'.join(slang), text.lower())) / len_word)
    }
    # one share-of-total feature per requested POS tag
    for f in pos_types:
        features['p_' + f] = r(
            len([t for t in pos_list if t == f]) / len(pos_list))
    return features
# Scan the processed Book of Mormon text for verses that mention
# "command(s)"/"commandment(s)" and also contain a promise-related word
# (prosper / bless / promise), printing the matching context snippets.
#
# Fix: converted Python 2 print statements to print() calls — the rest of
# this file uses f-strings (Python 3.6+), so py2 syntax could not parse in
# the same interpreter.
from re import findall as fa
import re
import sys

with open('BookOfMormonProcessed.csv') as f:
    s = f.readlines()

matchVerses = []
for verse in s:
    # a "command..." token with up to 7 words of context on each side
    matches = fa('(?:\w+,? ){,7}command(?:ment)?s?(?:,? \w+){,7}', verse, re.IGNORECASE)
    for i in matches:
        if fa(r'\bprosper\b|\bbless\b|\bpromise\b', verse, re.IGNORECASE):
            # if fa(r'\bif\b',i, re.IGNORECASE) and fa(r'(?:\bkeep\b)|(?:\bobey\b)|(?:\bobedient\b)|(?:\bprosper\b)',i, re.IGNORECASE):
            print(i)
            matchVerses.append(verse)
            break  # one hit per verse is enough

for i, j in enumerate(matchVerses):
    print(i, j)
print("Matches found:", len(matchVerses))
'''
a b d e g o p q A D O P Q R 0 4 6 9 has only 1 bounded region
B 8 has 2 bounded regions.
ip: abcdef
op - 4 ( a,b,d,e has one bounded region i.e. 1*4 = 4)
'''
from re import findall as fa

# Count the characters with exactly one enclosed ("bounded") region and the
# ones with two, then report the total number of bounded regions.
text = input()
single_region = fa(r'[abdegopqADOPQR0469]', text)
double_region = fa(r'[B8]', text)
#print(bound1,bound2)
total = len(single_region) + 2 * len(double_region)
print(total)
def visit_xrates(self):
    """
    Scrapes x-rates.com and extracts currency-rates against 1 EUR.
    Stores JSON file containing timestamp, currencies and respective rates.

    Returns (self.rates, rates_alt) when self.panda is falsy, False on any
    request failure.  NOTE(review): the final else branch calls
    self.all_rates() without returning its value, so that path yields None.
    """
    try:
        request = get(URL)
        if request.status_code != 200:
            return False
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
    # consider narrowing to requests.exceptions.RequestException.
    except:
        return False
    try:
        answer = html.fromstring(request.content)
    except NameError:
        print(
            '\npip module \'lxml\' not installed\nor installed for the wrong Python version'
        )
        exit()
    """
    uses xpaths from DOM to get what we need from the html
    it'll contain each line in answer as an item in a list
    """
    names_ans = answer.xpath(NAME_PATH)
    from_cur_ans = answer.xpath(FROM_PATH)
    to_cur_ans = answer.xpath(TO_PATH)
    """
    converts the items of all three lists into type string
    to utilize string methods
    """
    names_raw = [html.tostring(i) for i in names_ans]
    from_cur_raw = [html.tostring(i) for i in from_cur_ans]
    to_cur_raw = [html.tostring(i) for i in to_cur_ans]
    # contains all the currency abbreviations
    self.abb_names = [i for i in CURRENCY.keys()]
    """
    extracts the name from the html string
    it uses re.findall() to match currencies from utilities
    with the actual html string.
    if statement ensures that negative matches are ignored
    """
    self.names = []
    for check in CURRENCY.values():
        self.names.append([
            fa(check, str(name))[0] for name in names_raw
            if int(len(fa(check, str(name)))) > 0
        ])
    """
    uses find_floats() to extract floating numbers from html string
    """
    self.from_cur = [self.find_floats(i) for i in from_cur_raw]
    self.to_cur = [self.find_floats(i) for i in to_cur_raw]
    # concatenate names, from-rate and to-rate, against 1 EUR, in a list
    self.rates = [
        n + f + t for n, f, t in zip(self.names, self.from_cur, self.to_cur)
    ]
    # makes an alternative rates list with a timestamp
    rates_alt = {'timestamp': int(time()), 'rates': []}
    # the alt rates list uses abbreviations
    k = 0
    while k < len(self.rates):
        rates_alt['rates'].append({
            'currency': self.abb_names[k],
            'from': self.rates[k][1],
            'to': self.rates[k][2]
        })
        k += 1
    # data-source, displayed with output
    self.source = 'x-rates.com'
    # write it all to local file
    local_json = {
        'rates': self.rates,
        'rates_alt': rates_alt,
        'source': self.source
    }
    self.write_local(local_json)
    if not self.panda:
        return self.rates, rates_alt
    else:
        self.all_rates()
def find_floats(self, n):
    """Pull every numeric token (signed decimal or bare integer) out of str(n)
    and return them as a list of strings.
    """
    number_pattern = r"[-+]?\d*\.\d+|\d+"
    return fa(number_pattern, str(n))
def visit_ecb(self):
    """
    Scrapes ecb.europa.eu and extracts currency-rates against 1 EUR.
    Then stores JSON file containing timestamp, the currencies and
    the respective rates.

    Returns (rates_alt, rates) when self.panda is falsy, False on any
    request failure.  NOTE(review): the final else branch calls
    self.all_rates() without returning its value, so that path yields None.
    """
    try:
        request = get(URL_ALT)
        if request.status_code != 200:
            return False
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
    except:
        return False
    try:
        answer = html.fromstring(request.content)
    except NameError:
        print(
            '\npip module \'lxml\' not installed\nor installed for the wrong Python version'
        )
        exit()
    """
    I could not manage to get the respective xpaths for names, cur_from
    and cur_to but I did manage to get all of them in one big string.
    Thus, raw_list contains each html-line with type string as an item
    in a list.
    """
    raw = answer.xpath(ALT_PATH)
    raw_list = [html.tostring(i) for i in raw]
    """
    This is all extractions from visit_xrates() basically cramped into
    a nested for loop. Extrats name and 1 EUR to that given name, then
    finds the inverse of that and appends all of it.
    """
    rates = []
    for item in raw_list:
        for curenncy in CURRENCY:
            if int(len(fa(curenncy, str(item)))) > 0:
                rates.append({
                    # NOTE(review): the redundant parens/list wrapper below
                    # reduce to fa(curenncy, str(item))[0]
                    'currency': ((([fa(curenncy, str(item))[0]])))[0],
                    'from': self.find_floats(item)[0],
                    # inverse rate: name -> EUR
                    'to': (1 / float(self.find_floats(item)[0]))
                })
    # makes the alternative rates list with currency abbreviations
    rates_alt = []
    k = 0
    while k < len(rates):
        rates_alt.append((CURRENCY[rates[k]['currency']], rates[k]['from'],
                          rates[k]['to']))
        k += 1
    """
    sort() returns TypeError on rates
    thus using the value for key 'currency' in order to sort alphabetically
    """
    rates.sort(key=lambda value: value['currency'], reverse=False)
    rates_alt.sort()
    self.source = 'ecb.europa.eu'
    """
    adds timestamp to make sure the two scraping functions saves in the
    exact same structure for consistent extraction
    """
    local_json = {
        'rates': rates_alt,
        'rates_alt': {
            'timestamp': int(time()),
            'rates': rates
        },
        'source': self.source
    }
    # write it all to local file
    self.write_local(local_json)
    if not self.panda:
        return rates_alt, rates
    else:
        self.all_rates()