def build_term(term):
    term_obj = Term()
    for variable, terminal in term:
        if variable:
            symbol_obj = Symbol(variable, True)
        elif terminal:
            symbol_obj = Symbol(terminal, False)
        else:
            # Neither a variable nor a terminal: skip instead of re-adding a
            # stale symbol from the previous iteration.
            print('build_term: symbol is neither variable nor terminal')
            continue
        term_obj.add(symbol_obj)
    return term_obj

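# build_term relies on Symbol/Term container classes defined elsewhere in the
# project. A minimal sketch of that API as the function seems to assume it
# (names and fields here are illustrative, not the project's real definitions):
class Symbol:
    def __init__(self, value, is_variable):
        self.value = value
        self.is_variable = is_variable


class Term:
    def __init__(self):
        self.symbols = []

    def add(self, symbol):
        self.symbols.append(symbol)
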
def _get_term(data):
    if 'id' in data:
        term = Term.objects.get(pk=data['id'])
    else:
        term = Term()
    term.name_en = data.get('name_en', '')
    term.name_la = data.get('name_la', '')
    term.name_cs = data.get('name_cs', '')
    term.system = data.get('system', '')
    term.bodypart = data.get('body_part', '')
    term.fma_id = data.get('fma_id', -1)
    term.save()
    return term

def get_translations(data, text, src_lang):
    # TODO fix dictionary map for all languages
    dmap = {
        'ru': 'english-russian',
        'fr': 'english-french',
        'de': 'english-german',
    }
    for lang, dictionary in dmap.items():
        pat = '{0}/dictionary/{1}/{2}'
        url = pat.format(base, dictionary, text.replace(' ', '-'))
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        for sense in soup.find_all('div', class_='sense-body'):
            phrase = sense.find('div', class_='phrase-block')
            if phrase:
                continue
            trans = sense.find('span', class_='trans')
            if trans:
                for word in stripped_text(trans).split(','):
                    term = Term(text=word, lang=lang, region=None)
                    data['translated_as'].append(term)
    return data

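# stripped_text is assumed by the parsers throughout this file; a plausible
# minimal sketch (an assumption, not the project's actual helper) that
# collapses a tag's text and surrounding whitespace:
def stripped_text(tag):
    if tag is None:
        return ''
    return ' '.join(tag.get_text().split())
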
def get_translations(text, src_lang):
    # TODO fix dictionary map for all languages
    dmap = {
        'ru': 'english-russian',
        'fr': 'english-french',
        'de': 'english-german',
    }
    txt = text.replace(' ', '-')
    for lang, dictionary in dmap.items():
        url = f'{base}/dictionary/{dictionary}/{txt}'
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        for sense in soup.find_all('div', class_='sense-body'):
            phrase = sense.find('div', class_='phrase-block')
            if phrase:
                continue
            trans = sense.find('span', class_='trans')
            if trans:
                words = stripped_text(trans).split(',')
                words = [w for w in words if not is_empty(w)]
                for word in words:
                    term = Term(text=word, lang=lang, region=None)
                    yield ('translated_as', term)

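# Hedged usage sketch: the refactored parsers yield (relation, Term) pairs; a
# consumer might fold them back into the dict shape the older variants return
# (collect is an illustrative name, not a project function):
from collections import defaultdict

def collect(pairs):
    data = defaultdict(list)
    for relation, value in pairs:
        data[relation].append(value)
    return dict(data)

# data = collect(get_translations('blood vessel', 'en'))
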
def get_data(query, lang):
    if lang != 'en':
        return
    url = f'https://www.macmillandictionary.com/dictionary/british/{query}'
    headers = {
        'User-Agent': 'script',
        'Accept': 'text/html',
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # get transcription
    transcriptions = soup.find_all(class_='PRON')
    for t in transcriptions:
        yield ('transcription',
               Term(text=stripped_text(t).replace('/', ''),
                    lang=lang, region=None))

    # get tags; strip the zero-width-space junk substring first
    crop_text = stripped_text(soup.find(class_='zwsp'))
    part_speech = stripped_text(
        soup.find(class_='PART-OF-SPEECH')).replace(crop_text, '')
    syntax_coding = stripped_text(soup.find(class_='SYNTAX-CODING'))
    yield ('tag', Term(text=part_speech, lang=lang, region=None))
    yield ('tag', Term(text=syntax_coding, lang=lang, region=None))

    # get definitions
    definitions = soup.find_all(class_='DEFINITION')
    for d in definitions:
        yield ('definition', Term(text=stripped_text(d), lang=lang, region=None))

    # get examples
    examples = soup.find_all(class_='EXAMPLES')
    for e in examples:
        yield ('in', Term(text=stripped_text(e), lang=lang, region=None))
    examples = soup.find_all(class_='PHR-XREF')
    for e in examples:
        yield ('in', Term(text=stripped_text(e), lang=lang, region=None))

    # get synonyms
    synonyms = soup.find_all(class_='synonyms')
    for allsyn in synonyms:
        subsynonyms = allsyn.find_all(class_='theslink')
        for syn in subsynonyms:
            if '...' not in syn.text:
                yield ('synonym', Term(text=stripped_text(syn), lang=lang, region=None))

    # get audio; the play button may be absent for some entries
    audio = soup.find(class_='audio_play_button')
    if audio is not None:
        yield ('audio', File(url=audio['data-src-mp3'], region=None))
        yield ('audio', File(url=audio['data-src-ogg'], region=None))

def define_word(text, lang='en', source_idx=-1, count=1):
    term_id = define_term(Term(text=text, lang=lang, region=None))
    source_list = (sources if source_idx < 0
                   else sources[source_idx:source_idx + count])
    for source in source_list:
        data = get_data_safe(source, text, lang)
        if data is None:
            sys.exit(-1)
        push_data(term_id, data)

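# get_data_safe is assumed by define_word above; a plausible sketch wraps a
# source module's get_data and converts network failures into None so the
# caller can decide to abort (names here are illustrative):
import requests

def get_data_safe(source, text, lang):
    try:
        return list(source.get_data(text, lang))
    except requests.RequestException:
        return None
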
def parse_phrase_row(row, lang, trans_lang, tags):
    def parse_td(td):
        # if 'class' not in td.attrs:
        #     return None
        # k = td.attrs['class']
        # if k not in ['phraselist1', 'phraselist2']:
        #     return None
        a = td.find('a')
        if a is None:
            return None
        return stripped_text(a)

    result = [parse_td(t) for t in row.find_all('td')]
    if len(result) != 2:
        return []
    if any(t is None for t in result):
        return []
    term = Term(text=result[0], lang=lang, region=None)
    trans = Term(text=result[1], lang=trans_lang, region=None)
    return [TermWithData(term, {'tag': tags, 'translated_as': [trans]})]

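# Hedged usage sketch for parse_phrase_row: one <tr> from a two-column phrase
# table (the markup below is illustrative, not the real site's):
from bs4 import BeautifulSoup

html = '<tr><td><a>go on</a></td><td><a>продолжать</a></td></tr>'
row = BeautifulSoup(html, 'html.parser').find('tr')
tags = [Term(text='phrase', lang='en', region=None)]
pairs = parse_phrase_row(row, 'en', 'ru', tags)  # -> [TermWithData(...)]
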
def achievements(request):
    current_term = Term.current_term_key()
    active = Participant.objects.filter(is_active=True)
    with_awards = active.filter(grant__term_id=current_term).annotate(
        num_grants=Count('grant'),
        time=Max('grant__granted')).order_by('-num_grants', '-time')
    without_awards = active.exclude(grant__term_id=current_term)
    hide_nominate_link = request.REQUEST.get('hide_nominate_links', False)
    return render_to_response(
        'achievements.html', {
            'participants': itertools.chain(with_awards, without_awards),
            'show_nominate_link': not hide_nominate_link
        },
        context_instance=RequestContext(request))

def parse_thesaurus(lang, page):
    soup = BeautifulSoup(page, 'html.parser')
    dlist = soup.find_all('span', class_='syn-list')
    for d in dlist:
        synonyms = d.find_all('a')
        for s in synonyms:
            yield ('synonym', Term(text=stripped_text(s), lang=lang, region=None))
    dlist = soup.find_all('span', class_='rel-list')
    for d in dlist:
        related = d.find_all('a')
        for r in related:
            yield ('related', Term(text=stripped_text(r), lang=lang, region=None))
    dlist = soup.find_all('span', class_='ant-list')
    for d in dlist:
        antonyms = d.find_all('a')
        for r in antonyms:
            yield ('antonym', Term(text=stripped_text(r), lang=lang, region=None))

def __init__(self, *args, **kwargs):
    super(NominatePersonForm, self).__init__(*args, **kwargs)
    this_term = Term.current_term_key()
    term_grants = Grant.objects.filter(
        participant=self.instance.participant,
        term=this_term)
    term_user_nominations = Nomination.objects.filter(
        nominator=self.instance.nominator,
        participant=self.instance.participant,
        term=this_term)
    self.fields['achievement'] = forms.ModelChoiceField(
        queryset=Achievement.objects.filter(can_nominate=True)
        .exclude(grant__in=term_grants)
        .exclude(nomination__in=term_user_nominations))

def get_data(query, lang):
    if lang != 'en':
        return None
    data = {
        'audio': [],
        #'visual': [],
        'tag': [],
        'transcription': [],
        'definition': [],
        'in': [],
        'synonym': [],
        'antonym': [],
        'related': []
    }
    pat = 'https://www.merriam-webster.com/dictionary/{0}'
    url = pat.format(query)
    headers = {
        'User-Agent': 'script',
        'Accept': 'text/html',
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # find transcription and audio; the pronunciation block may be absent
    prs = soup.find('span', class_='prs')
    if prs is not None:
        transcription = prs.find('span', class_='pr')
        transcription = stripped_text(transcription)
        data['transcription'].append(transcription)
        btns = prs.find_all('a', class_='play-pron')
        urls = [parse_btn(b) for b in btns]
        urls = [u for u in urls if utils.url_exists(u)]
        for url in urls:
            data['audio'].append(File(url=url, region=None))

    # find definitions and 'in'
    vg = soup.find_all('div', class_='vg')
    for v in vg:
        definitions = v.find_all(class_='dt')
        for d in definitions:
            text = stripped_text(d)
            # definitions marked with class mw_t_bc start with ':'
            if d.find(class_='mw_t_bc') is not None:
                text = text.lstrip(':').strip()
            # definitions may embed example sentences (class ex-sent); drop them
            if d.find(class_='ex-sent') is not None:
                text = text.split('\n')[0].strip()
            data['definition'].append(Term(text=text, lang=lang, region=None))

    # parse examples
    data_in = soup.find_all(class_='ex-sent')
    for d in data_in:
        if 't' in d['class']:
            data['in'].append(Term(text=stripped_text(d), lang=lang, region=None))

    # parse related
    ure = soup.find_all(class_='ure')
    for d in ure:
        data['related'].append(Term(text=stripped_text(d), lang=lang, region=None))

    # parse tags
    tag = soup.find_all('span', class_='fl')
    for d in tag:
        data['tag'].append(Term(text=stripped_text(d), lang=lang, region=None))
    # tag the entry as a single word, because the query is a word
    data['tag'].append(Term(text='word', lang=lang, region=None))

    # move to the second page, the thesaurus
    pat_t = 'https://www.merriam-webster.com/thesaurus/{0}'
    url_t = pat_t.format(query)
    resp = requests.get(url_t, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    dlist = soup.find_all('span', class_='syn-list')
    for d in dlist:
        synonyms = d.find_all('a')
        for s in synonyms:
            data['synonym'].append(Term(text=stripped_text(s), lang=lang, region=None))
    dlist = soup.find_all('span', class_='rel-list')
    for d in dlist:
        related = d.find_all('a')
        for r in related:
            data['related'].append(Term(text=stripped_text(r), lang=lang, region=None))
    dlist = soup.find_all('span', class_='ant-list')
    for d in dlist:
        antonyms = d.find_all('a')
        for r in antonyms:
            data['antonym'].append(Term(text=stripped_text(r), lang=lang, region=None))
    return data

def define_word(text, lang='en', source_idx=-1):
    term_id = define_term(Term(text=text, lang=lang, region=None))
    source_list = sources if source_idx < 0 else [sources[source_idx]]
    for source in source_list:
        data = source.get_data(text, lang)
        push_data(term_id, data)

def get_data(query, lang):
    if lang != 'en':
        return
    url = f'https://www.merriam-webster.com/dictionary/{query}'
    headers = {
        'User-Agent': 'script',
        'Accept': 'text/html',
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # find transcription and audio
    prs = soup.find('span', class_='prs')
    if prs is not None:
        transcription = prs.find('span', class_='pr')
        transcription = stripped_text(transcription)
        yield ('transcription', Term(text=transcription, lang='ipa', region=None))
        btns = prs.find_all('a', class_='play-pron')
        urls = [parse_btn(b) for b in btns]
        urls = [u for u in urls if utils.url_exists(u)]
        for url in urls:
            yield ('audio', File(url=url, region=None))

    # find definitions and 'in'
    vg = soup.find_all('div', class_='vg')
    for v in vg:
        definitions = v.find_all(class_='dt')
        for d in definitions:
            text = stripped_text(d)
            # definitions marked with class mw_t_bc start with ':'
            if d.find(class_='mw_t_bc') is not None:
                text = text.lstrip(':').strip()
            # definitions may embed example sentences (class ex-sent); drop them
            if d.find(class_='ex-sent') is not None:
                text = text.split('\n')[0].strip()
            yield ('definition', Term(text=text, lang=lang, region=None))

    # parse examples
    data_in = soup.find_all(class_='ex-sent')
    for d in data_in:
        if 't' in d['class']:
            yield ('in', Term(text=stripped_text(d), lang=lang, region=None))

    # parse related
    ure = soup.find_all(class_='ure')
    for d in ure:
        yield ('related', Term(text=stripped_text(d), lang=lang, region=None))

    # parse tags
    tag = soup.find_all('span', class_='fl')
    for d in tag:
        yield ('tag', Term(text=stripped_text(d), lang=lang, region=None))
    # tag the entry as a single word, because the query is a word
    yield ('tag', Term(text='word', lang=lang, region=None))

    # move to the second page, the thesaurus
    url_t = f'https://www.merriam-webster.com/thesaurus/{query}'
    resp = requests.get(url_t, headers=headers)
    if resp.ok:
        for t in parse_thesaurus(lang, resp.text):
            yield t

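# parse_btn is assumed by both get_data variants above. A sketch under the
# assumption that Merriam-Webster's play-pron links carry data-dir/data-file
# attributes and that audio lives under this URL pattern (both are
# assumptions; utils.url_exists filters out wrong guesses):
def parse_btn(btn):
    directory = btn.attrs.get('data-dir', '')
    name = btn.attrs.get('data-file', '')
    return ('https://media.merriam-webster.com/audio/prons/en/us/mp3/'
            f'{directory}/{name}.mp3')
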
import sys
import utils
import requests
import json
from bs4 import BeautifulSoup
from models import Term, TermWithData

headers = {
    'User-Agent': utils.CHROME_USER_AGENT,
    'Accept': 'text/html',
}

# TODO consider collecting automatically
# https://www.multitran.com/m.exe?s=place&l1=2&l2=1&fl=1
categories = [{
    'tag': [
        Term(text='idiom', lang='en', region=None),
        Term(text='идиома', lang='ru', region=None),
    ],
    'id': 895,
}, {
    'tag': [
        Term(text='proverb', lang='en', region=None),
        Term(text='пословица', lang='ru', region=None),
    ],
    'id': 310,
}, {
    'tag': [
        Term(text='americanism', lang='en', region=None),
        Term(text='американизм', lang='ru', region=None),

def query(term_id):
    query = Term.select(Term.title).where(Term.id == term_id).dicts()
    return query

def get_data(text, lang):
    if lang != 'en':
        return
    txt = text.replace(' ', '-')
    url = f'{base}/dictionary/english/{txt}'
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    codes = {
        'C': 'countable',
        'U': 'uncountable',
        'S': 'singular',
    }
    posgram_found = False
    gram_found = False
    if utils.is_word(text):
        yield ('tag', Term(text='word', lang=lang, region=None))
    soup = BeautifulSoup(resp.text, 'html.parser')
    page = soup.find('div', class_='page')
    for dictionary in page.find_all('div', class_='dictionary'):
        header = dictionary.find('div', class_='pos-header')
        body = dictionary.find('div', class_='pos-body')
        posgram = header.find('div', class_='posgram')
        if posgram and not posgram_found:
            pos = find_strip(posgram, 'span', class_='pos')
            term = Term(text=pos, lang=lang, region=None)
            yield ('tag', term)
            posgram_found = True
        if not gram_found:
            for gram in body.find_all('span', class_='gram'):
                for gc in gram.find_all('span', class_='gc'):
                    code = stripped_text(gc)
                    if code in codes and not gram_found:
                        term = Term(text=codes[code], lang=lang, region=None)
                        yield ('tag', term)
                        gram_found = True
        # parse pronunciations; the amp-audio element may be absent
        for dpron in header.find_all('span', class_='dpron-i'):
            region = find_strip(dpron, 'span', 'region')
            amp = header.find('amp-audio')
            if amp is not None:
                for source in amp.find_all('source'):
                    file = File(url=base + source.attrs['src'], region=region)
                    yield ('audio', file)
            ipa = find_strip(dpron, 'span', class_='ipa')
            if not is_empty(ipa):
                yield ('transcription', Term(text=ipa, lang=lang, region=region))
        for dblock in body.find_all('div', class_='def-block'):
            def_text = stripped_text(dblock.find('div', class_='def'))
            if not is_empty(def_text):
                yield ('definition', Term(text=def_text, lang=lang, region=None))
            img = dblock.find('amp-img')
            if img is not None:
                file = File(url=base + img.attrs['src'], region=None)
                yield ('visual', file)
            for eg in dblock.find_all('span', class_='eg'):
                term = Term(text=stripped_text(eg), lang=lang, region=None)
                yield ('in', term)
    for dataset in page.find_all('div', class_='dataset'):
        for eg in dataset.find_all('span', class_='deg'):
            term = Term(text=stripped_text(eg), lang=lang, region=None)
            yield ('in', term)
        cpegs = dataset.find('div', class_='cpegs')
        if cpegs:
            for lbb in cpegs.find_all('div', class_='lbb'):
                for a in lbb.find_all('a', class_='hdib'):
                    term = Term(text=stripped_text(a), lang=lang, region=None)
                    yield ('collocation', term)
    for t in get_translations(text, lang):
        yield t

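# find_strip and is_empty are assumed by the Cambridge parser above; minimal
# sketches consistent with the call sites (assumptions, not the project's
# actual helpers):
def find_strip(tag, name, class_=None):
    found = tag.find(name, class_=class_)
    return stripped_text(found) if found is not None else ''


def is_empty(text):
    return not text or not text.strip()
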
def get_all_terms(self):
    query = Term.select(Term.title, Term.id).dicts()
    return query

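# A minimal peewee Term model consistent with the two query helpers above;
# the field types and database binding are assumptions, not the project's
# real schema:
from peewee import Model, SqliteDatabase, TextField

db = SqliteDatabase('terms.db')


class Term(Model):
    title = TextField()

    class Meta:
        database = db

# Both helpers return lazy queries of dicts; iterating executes the SQL, e.g.:
# for row in store.get_all_terms():
#     print(row['id'], row['title'])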