def extract_info(article):
    '''
    INPUT: dict object with output from the api
    OUTPUT: bool if extraction was successful or not, dict object to insert into mongodb
    '''
    headline = unidecode(article['webTitle'])
    date_published = str(article['webPublicationDate'])
    try:
        author = article['blocks']['body'][0]['createdBy']['firstName'] + ' ' + \
            article['blocks']['body'][0]['createdBy']['lastName']
    except:
        author = None
    try:
        url = str(article['webUrl'])
    except:
        return False, ''
    try:
        article_text = '\n'.join([unidecode(text_block['bodyTextSummary'])
                                  for text_block in article['blocks']['body']])
    except:
        return False, ''
    insert = {'url': url,
              'source': 'guardian',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert

def page_url(corpus, ctx_type, book_path, book_id, jsonfile):
    """
    Modified htrc_*_label_fn. The individual volumes don't have 'book'
    as a context type.
    """
    import json
    from vsm.viewer import doc_label_name
    import re

    urls = []
    corp_md = corpus.view_metadata("page")

    jsonpath = os.path.join(book_path, jsonfile)
    with open(jsonpath, "r") as f:
        md = json.load(f)

    url = ""
    li = sorted(md["items"], key=lambda k: int(k["lastUpdate"]))
    url = li[-1]["itemURL"]

    if ctx_type == "book":
        urls.append(unidecode(url))
    else:
        # urls for pages
        page_md = corpus.view_metadata("page")
        files = page_md[doc_label_name("page")]
        nums = [re.findall("[1-9][0-9]*", a)[-1] for a in files]
        for i in nums:
            s = url + "?urlappend=%3Bseq={0}".format(i)
            urls.append(unidecode(s))

    return urls

def get_items_by_letter(self):
    items_by_letter = OrderedDict()
    for letter, items in groupby(self.get_items(), self.sortkey):
        items_by_letter[unidecode(letter)] = tuple(
            sorted(items, key=lambda i: unidecode(self.get_title(i))))
    return items_by_letter

def crawl_video_urls(url='http://documentaryheaven.com/category/space/'):
    myopener = MyOpener()
    page = myopener.open(url)
    page = page.read()
    html = BeautifulSoup(page, "lxml")

    # find all class=post
    posts = html.find_all('div', class_="post")

    # for each class=post:
    for p in posts:
        obj = {}
        # class=post-title --> a (href, string)
        title = p.find('h2').find('a')
        obj['url'] = title['href']
        obj['title'] = unidecode(title.string)
        # class=browse-description --> p (string)
        abstract = p.find('div', class_='browse-description').find('p')
        obj['abstract'] = unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
        results.append(obj)

    # next page: class=next --> (href)
    next_page = html.find('a', class_="next page-numbers")
    if not next_page:
        return None
    print results
    print next_page['href']
    return crawl_video_urls(url=next_page['href'])

def extract_info(article):
    '''
    INPUT: dict object with output from the api
    OUTPUT: bool if extraction was successful or not, dict object to insert into mongodb
    '''
    headline = unidecode(article['title']['$text'])
    date_published = str(article['pubDate']['$text'])
    try:
        author = [str(author['name']['$text']) for author in article['byline']]
    except:
        author = None
    try:
        url = str(article['link'][0]['$text'])
    except:
        return False, ''
    try:
        article_text = unidecode(' '.join([line.get('$text', '\n')
                                           for line in article['text']['paragraph']]))
    except:
        return False, ''
    insert = {'url': url,
              'source': 'npr',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert

def save(self, force_insert=False, force_update=False, using=None, update_fields=None):
    if update_generated_value(self, 'name', 'search_name'):
        self.search_name = unidecode(self.name)
    if update_generated_value(self, 'surname', 'search_surname'):
        self.search_surname = unidecode(self.surname)
    super(KnownPlayer, self).save(force_insert, force_update, using, update_fields)

def test_ascii(self):
    log = []

    def showwarning_new(message, category, *args):
        if ("not an unicode object" in str(message)) and \
                (category is RuntimeWarning):
            log.append((message, category))
        else:
            showwarning_old(message, category, *args)

    showwarning_old = warnings.showwarning
    warnings.showwarning = showwarning_new
    warnings.filterwarnings("always")

    for n in xrange(0, 128):
        t = chr(n)
        self.assertEqual(unidecode(t), t)

    # Passing string objects to unidecode should raise a warning
    self.assertEqual(128, len(log))
    log = []

    for n in xrange(0, 128):
        t = unichr(n)
        self.assertEqual(unidecode(t), t)

    # unicode objects shouldn't raise warnings
    self.assertEqual(0, len(log))

    warnings.showwarning = showwarning_old

def parsefile(f, inPre, titleSet, per, loc, org, other):
    fin = codecs.open(inPre + f, encoding='utf-8')
    for line in fin:
        if len(line.strip().split("\t")) != 11:
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, raw_text, \
            h_tokens_ent, b_tokens_ent = line.strip().split("\t")  # h_tokens, b_tokens,
        if title in titleSet:
            continue
        else:
            titleSet.add(title)
        if len(b_tokens_ent.split()) > MAX_BODY_LEN:
            continue
        h_tokens_ent = unidecode.unidecode(h_tokens_ent.strip())
        b_tokens_ent = unidecode.unidecode(b_tokens_ent.strip())
        # h = grep_ent_with_context(h_tokens_ent, per, loc, org, other)  # fds_per_| asked me about ...
        # b = grep_ent_with_context(b_tokens_ent, per, loc, org, other)
        h = grep_ent(h_tokens_ent, per, loc, org, other)  # fsd_per_| oregon_loc_| ...
        b = grep_ent(b_tokens_ent, per, loc, org, other)
        h = rep2.sub('', h)
        b = rep2.sub('', b)
        h = my_tokenizer(h, tokenizer)
        b = my_tokenizer(b, tokenizer)
        tokens = h + ' ' + h + ' ' + b  # title twice
        # can also leave lowercase to scikit
        yield tokens.lower(), bk.News(ID, title, raw_text, snippets, key_word, source,
                                      created_at, f.split('.')[0], h_tokens_ent, b_tokens_ent)
    fin.close()

def path(start, end):
    start = unidecode.unidecode(start)
    end = unidecode.unidecode(end)
    print start
    print end
    path = find_path(start, end)
    return jsonify(path=path)

def attributeUUID(context):
    sql_id = getattr(context, 'sql_id', None)
    try:
        sql_id = str(unidecode(str(sql_id)))
    except:
        sql_id = str(unidecode(sql_id))
    return str(getattr(context, 'portal_type', '')) + '-' + sql_id

def do_the_trick():
    print "\n\t-->> [Collecting]"
    global dot
    global samples
    dot, samples = do_fetch()
    if len(samples) > 0:
        print "\n\t-->> [Playing]:"
        for ind_s, s in enumerate(samples):
            print "\n<.%s.>" % s['text']
            # threat msg for spacing and tokenizing
            for j, k in enumerate(kws):
                if unidecode(k).lower() in unidecode(s['text']).lower():
                    newTweet = M.tweetmetanalyze(unidecode(s['text']))
                    ste = newTweet
                    print "U:", ste
                    # here, send osc
                    try:
                        cmnd = OSC.OSCMessage("/tweet")
                        cmnd.append(ste)
                        cmnd.append(ind_s)
                        oscClient.send(cmnd)
                        cmnd = OSC.OSCMessage("/palabra")
                        cmnd.append(categorias_emotivas[gesto_to_class[k]])
                        cmnd.append(gesto_to_inten[k])
                        oscClient_ari.send(cmnd)
                    except:
                        print '\n\tAquí le falló\n\t'
                    sleep(randint(1, 5))

def normalize_unicode(s):
    if type(s) == bytearray:
        return unidecode(unicode(str(s), 'utf-8')).lower()
    elif type(s) == unicode:
        return unidecode(s).lower()
    else:
        return unidecode(unicode(s, 'utf-8')).lower()

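# A minimal Python 2 usage sketch for normalize_unicode above; the sample strings are
# hypothetical and only illustrate the three input types the function handles.
print normalize_unicode(u'Caf\xe9 Cr\xe8me')           # unicode object -> 'cafe creme'
print normalize_unicode('Caf\xc3\xa9 Cr\xc3\xa8me')    # UTF-8 byte string -> 'cafe creme'
print normalize_unicode(bytearray('Caf\xc3\xa9'))      # bytearray -> 'cafe'
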
def parseCISI(title, tmdb_title=None):
    movs = search(title)
    mov = None
    mov_id = None
    imdb_id = None
    year = None
    ss = []
    sel = 'n'
    if movs is not None and len(movs) > 0:
        for m in movs:
            cisi_title = unidecode(m['title']).replace(',', '')
            if cisi_title.lower() == title.lower():
                sel = 'y'
                break
            elif title.lower() in cisi_title.lower() or cisi_title.lower() in title.lower():
                sel = raw_input(
                    "Matching '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                        title, cisi_title, m['_id'])).lower()
                if sel == 'y':
                    break
                print("Trying again...")
    elif tmdb_title is not None:
        movs = search(tmdb_title)
        sel = 'n'
        if movs is not None and len(movs) > 0:
            for m in movs:
                cisi_title = unidecode(m['title'].decode('utf-8'))
                if cisi_title.lower() == tmdb_title.lower():
                    sel = 'y'
                    break
                elif tmdb_title.lower() in cisi_title.lower() or cisi_title.lower() in tmdb_title.lower():
                    sel = raw_input(
                        "Matching TMDB '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                            tmdb_title, cisi_title, m['_id'])).lower()
                    if sel == 'y':
                        break
                else:
                    print("Trying again...")
    if sel == 'y':
        mov = m
        mov_id = str(m['_id'])
        year = int(m['year'])
        if 'imdb' in m['links'].keys():
            imdb_id = str(m['links']['imdb'].split("/")[-2])
    else:
        print("Unable to find match in canistream.it for '{}'".format(title))
    if mov is not None:
        ss = getStreams(mov_id)
        print("* MATCHED canistream.it")
    elif tmdb_title is not None:
        print("Streaming availability won't be available.")
    return mov_id, year, ss, imdb_id

def get_absolute_url(self):
    return ('nhom', (), {
        'monhoc': slugify(unicode(unidecode(self.mon_hoc.ten_mon))),
        'monhocid': self.mon_hoc_id,
        'nhom': slugify(unicode(unidecode(self.ten_nhom))),
        'nhomid': self.pk,
    })

def getIndividualSubject(roster_semester, subject):
    url = COURSE_ROSTER_API_CLASSES + roster_semester + '&subject=' + subject
    soup = BeautifulSoup(requests.get(url).text)
    classes = soup.find_all('class')
    for c in classes:
        listing = subject + c.find('catalognbr').text
        if listing not in COURSE_DICT:
            name = unidecode(c.find('titlelong').text.replace('\n', ' '))
            units_min = c.find('unitsminimum').text
            units_max = c.find('unitsmaximum').text
            if units_min == units_max:
                credits = units_min
            else:
                credits = units_min + "-" + units_max
            course_obj = Course(listing, name, credits)
            course_obj.description = unidecode(c.find('description').text.replace('\n', ' '))
            course_obj.offered = unidecode(c.find('catalogwhenoffered').text.replace('\n', ' '))
            course_obj.prerequisites = unidecode(c.find('catalogprereqcoreq').text.replace('\n', ' '))
            course_obj.arts_tags = unidecode(c.find('catalogdistr').text.replace('\n', ' '))
            crosslists = []
            for combination in c.find_all('combination'):
                crosslists.append(combination.find('subject').text + combination.find('catalognbr').text)
            course_obj.crosslisted_classes = ";".join(crosslists)
            COURSE_DICT[listing] = course_obj
            print str(course_obj)
            print '-' * 50

def js_signature(self, input_map=False):
    """Returns the javascript signature for a prediction method.
    """
    objective_field = self.tree.fields[self.tree.objective_id]
    if not 'CamelCase' in objective_field:
        camelcase = to_camel_js(unidecode(objective_field['name']), False)
        objective_field['CamelCase'] = camelcase
    output = u"function predict%s(" % objective_field['CamelCase']

    args = []
    if len(self.tree.fields) > MAX_ARGS_LENGTH or input_map:
        args.append("data")
    else:
        for field in [(key, val) for key, val in sort_fields(self.tree.fields)]:
            field_obj = self.tree.fields[field[0]]
            if not 'camelCase' in field_obj:
                field_obj['camelCase'] = to_camel_js(
                    unidecode(field_obj['name']))
            if field[0] != self.tree.objective_id:
                args.append(u"%s" % field_obj['camelCase'])

    args_string = u", ".join(args)
    output += args_string + u")"
    return output

def plug_in(self, out=sys.stdout, hadoop=False, filter_id=None, subtree=True):
    """Generates a basic javascript implementation of local predictions

    `out` is the file descriptor to write the javascript code to.
    """
    # fill the camelcase variable names with the JS_KEYWORDS restrictions
    objective_field = self.tree.fields[self.tree.objective_id]
    camelcase = to_camel_js(unidecode(objective_field['name']), False)
    objective_field['CamelCase'] = camelcase
    for field in [(key, val) for key, val in sort_fields(self.tree.fields)]:
        field_obj = self.tree.fields[field[0]]
        field_obj['camelCase'] = to_camel_js(unidecode(field_obj['name']))
    body, term_analysis_predicates, item_analysis_predicates = \
        self.tree.plug_in_body()
    terms_body = ""
    items_body = ""
    if term_analysis_predicates:
        terms_body = self.js_term_analysis_body(term_analysis_predicates)
    if item_analysis_predicates:
        items_body = self.js_item_analysis_body(item_analysis_predicates)
    output = self.js_pre_body()
    output += terms_body + items_body + body
    output += u"%sreturn null;\n}\n" % INDENT
    if not PY3:
        output = output.encode("utf8")
    out.write(output)
    out.flush()

def test_surrogate_pairs(self):
    # same character, written as a non-BMP character and a
    # surrogate pair
    s = _u('\U0001d4e3')

    # Note: this needs to be constructed at run-time, otherwise
    # a "wide" Python seems to optimize it automatically into a
    # single character.
    s_sp_1 = _u('\ud835')
    s_sp_2 = _u('\udce3')
    s_sp = s_sp_1 + s_sp_2

    if sys.version_info < (3, 4):
        self.assertEqual(s.encode('utf16'), s_sp.encode('utf16'))
    else:
        self.assertEqual(s.encode('utf16'),
                         s_sp.encode('utf16', errors='surrogatepass'))

    wlog = WarningLogger()
    wlog.start("Surrogate character")

    a = unidecode(s)
    a_sp = unidecode(s_sp)

    self.assertEqual('T', a)

    # Two warnings should have been logged
    self.assertEqual(2, len(wlog.log))

    wlog.stop()

def toascii(s):
    from unidecode import unidecode
    import codecs
    if isinstance(s, str):
        return unidecode(codecs.decode(s, 'utf-8'))
    elif isinstance(s, list):
        return map(lambda x: unidecode(codecs.decode(x, 'utf-8')), s)

def _fetch_artwork_info(image_id, page_url):
    """
    Scrape the artwork info page for relevant properties and return them as a dict.
    """
    r = _get_response(page_url)
    soup = bs4.BeautifulSoup(r.text, "lxml")
    info = {}
    for tag in soup.find_all(lambda _tag: _tag.has_attr('itemprop')):
        itemprop_name = tag.attrs['itemprop']
        if itemprop_name == 'keywords':
            keywords = [x.strip().strip(',').lower() for x in tag.strings]
            print keywords
            keywords = [x for x in keywords if x != '']
            info[itemprop_name] = keywords
        elif tag.name == 'img':
            # itemprop='image'
            info[itemprop_name] = tag['src'].split('!')[0]
        else:
            # TODO: parse itemprop='name' differently,
            # as there are 2 names: for artist and for artwork
            info[itemprop_name] = unidecode(tag.text.strip().lower())

    for tag in soup.find_all('div', attrs={'class': 'info-line'}):
        strings = [unidecode(x).lower() for x in tag.stripped_strings]
        if len(strings) == 0:
            continue
        if strings[0] == 'style:':
            info['style'] = '$'.join(strings[1:])
        elif strings[0] == 'media:':
            info['media'] = map(lambda s: s.strip(','), strings[1:])
            info['media'] = [x for x in info['media'] if len(x) > 0]
        elif strings[0] == 'location:':
            info['location'] = '$'.join(strings[1:])
    return info

def xml_to_txt(full_path, docType, docNumTag, sectionTag, secNumTag, secValueTag, noteTag):
    soup = Soup(open(full_path, "rb").read())
    docnum = soup.find(docNumTag).text.replace("Title ", "")
    for section in soup.findAll(sectionTag):
        parents = [x.name for x in section.parents]
        if noteTag not in parents:
            sec = ""
            try:
                sec = section.find(secNumTag)
                sec = unidecode(sec[secValueTag])
            except:
                sec = unidecode(section.find(secNumTag).text).replace("SS", "§").strip(" ").strip(".").split(" ")[-1]  # .replace("Sec. ", "")
            sec = sec.replace(" to ", "_to_")
            sec = sec.replace(" through ", "_to_")
            sec = sec.strip()
            outpath = "../output/xml/{}/{}/sections/".format(docType, docnum)
            if not os.path.exists(outpath):
                os.makedirs(outpath)
            filepath = "{}{}.txt".format(outpath, sec)
            if not os.path.exists(filepath):
                f = open(filepath, 'w', encoding="utf-8")
                f.write(str(section))
                print(" --XML added to directory {}".format(filepath))

def create_station(genre, phone_number):
    station = Station.objects.create()
    account, created = Account.objects.get_or_create(phone_number=phone_number)
    account.stations.add(station)
    account.current_station = station
    account.save()

    artists = get_artists(genre)
    client = soundcloud.Client(client_id=settings.SOUNDCLOUD_CLIENT_ID)

    for artist in artists:
        tracks = client.get('/tracks', q=artist['name'], streamable=True)
        for track in tracks:
            if track.stream_url and track.streamable and track.duration < MAX_DURATION:
                try:
                    song = Song.objects.create(
                        title=unidecode(track.title),
                        sid=track.id,
                        genre=unidecode(track.genre)
                    )
                    station_song = StationSong.objects.create(
                        song=song,
                        index=station.songs.count()
                    )
                    station.songs.add(station_song)
                    station.save()
                    break
                except:
                    print "SoundCloud fail >.<" + str(track)

    return station.pk

def _alignBySplittingToken(self, tag, word, t_iter):
    # alignment helper
    self.logger.debug('tag %s exceeds word %s', repr(tag.word), repr(word))
    tmp = list(tag)
    words = [word]
    asciis = [unidecode(word).replace('-', '')]
    tag_word = ''.join(self.tokenizer.split(tag.word))
    aligned = lambda: ''.join(asciis) == tag_word
    max_len = len(tag_word)
    aligned_tags = []

    while not aligned() and sum(map(len, asciis)) < max_len:
        words.append(next(t_iter))
        asciis.append(unidecode(words[-1]).replace('-', ''))

    if aligned():
        self.logger.debug('dropping tag %s [%s] for words "%s"',
                          repr(tag.word), tag[-1], ' '.join(words))
        for w, a in zip(words, asciis):
            tmp[0] = w
            tmp[1] = a
            self.logger.debug('adding tag %s [%s]', repr(w), tmp[-1])
            aligned_tags.append(Token(*tmp))
            for p in (3, 4):
                if tmp[p].startswith('B-'):
                    tmp[p] = 'I' + tmp[p][1:]
    else:
        raise RuntimeError('alignment of words %s as %s to token "%s" as "%s" failed' % (
            repr(words), repr(asciis), tag.word, tag_word
        ))

    return aligned_tags

def _sort_glossary(qresult, lang):
    """
    Sort the result into categories and questions from
    response returned by the backend engine
    """
    glossary_content = []
    letters = []
    letters_found = OrderedDict()
    field = "glossary_term_lang_" + lang
    for i in string.ascii_uppercase:
        letters.append(i)
        letters_found[i] = 0

    if len(qresult) > 0:
        # Process results
        from itertools import groupby
        items = [o.get_stored_fields() for o in qresult]
        items = sorted(items, key=lambda x: unidecode(x[field]))
        for k, g in groupby(items, key=lambda x: unidecode(x[field])[0]):
            letters_found[k] = 1
            glossary_content.append(
                {
                    "letter": k,
                    "terms": [
                        {"term": item[field],
                         "description": item["glossary_description_lang_" + lang]}
                        for item in g
                    ],
                }
            )
    return letters, letters_found, glossary_content

def fast_iter(context, func, *args, **kwargs):
    # xml categories
    collaborations = [u'www', u'phdthesis', u'inproceedings', u'incollection',
                      u'proceedings', u'book', u'mastersthesis', u'article']
    author_array = []
    title = ''
    # read chunk line by line
    # we focus on author and title
    for event, elem in context:
        if elem.tag == 'author':
            author_array.append(unidecode(elem.text))
        if elem.tag == 'title':
            if elem.text:
                title = unidecode(elem.text)
        if elem.tag in collaborations:
            # rejected paper has no author or title; it should be checked
            if len(author_array) != 0 and title != '':
                for a in author_array:
                    func(a + "||" + title, *args, **kwargs)  # write into kv file
            title = ''
            del author_array[:]
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
    del context

def scrape_wikitables():
    """Scrapes wikipedia for the list of current top boxers"""
    champURL = "https://en.wikipedia.org/wiki/List_of_current_boxing_rankings"
    page = urllib.request.urlopen(champURL)
    soup = bs4.BeautifulSoup(page, "html5lib")
    tables = soup.find_all("table", {"class": "wikitable"})
    unique_boxers = []

    for table_number in range(1, 6):
        table = tables[table_number]
        rows = table.find_all("tr")
        for row in rows:
            data = row.find_all("td")
            text = [i.text for i in data]
            for boxer_name in range(len(text)):
                if len(text[boxer_name]) > 3:
                    boxer_name = text[boxer_name].rstrip('\n')
                    boxer_name = re.findall(r"\S{3,}\ .[^\ \(]+", boxer_name)
                    if len(boxer_name) > 0:
                        if unidecode(boxer_name[0]) not in unique_boxers:
                            unique_boxers.append(unidecode(boxer_name[0]))

    unique_boxers.sort()
    return unique_boxers

def get_forms(c):
    global forms

    c.execute("SELECT DISTINCT species_id FROM pokemon WHERE id IN (SELECT pokemon_id FROM pokemon_forms WHERE form_identifier != 'NULL' ORDER BY pokemon_id) ORDER BY species_id")
    species_ids = c.fetchall()

    for i in range(len(species_ids)):
        c.execute("SELECT name FROM pokemon_species_names WHERE pokemon_species_id=%d AND local_language_id=9" % species_ids[i][0])
        species_name = str(unidecode(c.fetchone()[0])).replace("-", "_").replace(" ", "_").replace(".", "").replace("'", "")

        c.execute("SELECT pokemon_form_id,form_name FROM pokemon_form_names WHERE pokemon_form_id IN (SELECT id FROM pokemon_forms WHERE pokemon_id IN (SELECT id FROM pokemon WHERE species_id=%s)) AND local_language_id=9" % species_ids[i][0])
        species_forms = c.fetchall()

        form_index = []
        form_index += [species_name]
        for j in range(len(species_forms)):
            form_name = "STANDARD" if species_forms[j][1] == None else \
                str(unidecode(species_forms[j][1])).replace("-", "_").replace(" ", "_").replace(".", "").replace("'", "").upper()
            form_name = form_name.replace("_FORME", "").replace("_FORM", "").replace("_TYPE", "").replace("_ROTOM", "").replace("???", "QUESTION_MARK").replace("!", "EXCLAMATION_MARK")
            form_name = form_name.replace("?", "QUESTION_MARK").replace("_PATTERN", "").replace("_KYUREM", "").replace("_MODE", "")

            if "MEGA" in form_name and "_X" in form_name:
                form_name = "MEGA_X"
            elif "MEGA" in form_name and "_Y" in form_name:
                form_name = "MEGA_Y"
            elif "MEGA" in form_name:
                form_name = "MEGA"

            form_index += [(species_forms[j][0], form_name)]

        forms += [form_index]

def parse_authors(self):
    # Create authors
    print "Parsing Authors..."
    f = open(data_io.get_paths()["author_processed_path"], "r")
    titles = f.readline()
    for l in f.readlines():
        res = l.strip().split(",")
        # Titles
        raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
        (name, surname) = nlp.filter_title(raw_title)
        try:
            self.surnames[surname] = self.surnames[surname] + 1
        except:
            self.surnames[surname] = 1
        # Affiliations
        raw_affiliation = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
        affiliation = nlp.filter_affiliation(raw_affiliation)
        try:
            self.affiliations[affiliation] = self.affiliations[affiliation] + 1
        except:
            self.affiliations[affiliation] = 1
        self.authors[int(res[0])] = author.Author(int(res[0]), name, surname, affiliation)
    print "Done"
    f.close()

def main():
    URL_SENTIMENT140 = 'http://www.sentiment140.com/api/[email protected]'
    tweets = []
    for line in sys.stdin:
        try:
            tweetData = json.loads(line.decode('utf-8'))
            location = tweetData['user']['location'].strip()
            if location is None or bool(re.search(r'\d', location)):
                location = 'unknown'
            tempDataDict = {'text': unidecode(tweetData['text']),
                            'location': unidecode(location.upper())}
            tweets.append(tempDataDict)
        except:
            continue

    dataToSend = {'data': tweets}
    try:
        response = urllib2.urlopen(URL_SENTIMENT140, str(dataToSend))
        sentimentJsonResponse = json.loads(response.read())
        parsedDataDict = parseResponse(sentimentJsonResponse)
        for key, value in parsedDataDict.items():
            print "{0}\t{1}".format(key, value)
    except HTTPError as e:
        print 'The server couldn\'t fulfill the request.'
        print 'Error code: ', e.code
    except URLError as e:
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
    except:
        print 'response from server is null or some error has occured'

def get_map_pointers(self):
    pointers = {}
    projects = Project.objects.all()
    collection_points = CollectionPoint.objects.all()
    marker_project = 'markers/project.png'
    marker_collection = 'markers/collection.png'
    if projects:
        for project in projects:
            name = project.name
            name = unidecode(re.sub(r'[^a-zA-Z_]', '', name))
            pointers[name] = {
                'latitude': project.latitude,
                'longitude': project.longitude,
                'url': project.get_absolute_url(),
                'marker': marker_project,
            }
    if collection_points:
        for point in collection_points:
            name = u'{}{}'.format(point.name, "_c")
            name = unidecode(re.sub(r'[^a-zA-Z_]', '', name))
            pointers[name] = {
                'latitude': point.latitude,
                'longitude': point.longitude,
                'url': reverse('addresses'),
                'marker': marker_collection,
            }
    return pointers

def cleanText(text):
    "Removes punctuation and replaces spaces with underscores"
    text = unidecode(text)
    text = re.sub("[^A-Za-z0-9 _]", "", text)
    text = text.replace(" ", "_").lower()
    return text

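# A short, hypothetical usage sketch for cleanText above (assuming `unidecode` and `re`
# are imported in the same module); the sample input is made up.
print(cleanText("Crème brûlée, s'il vous plaît!"))   # -> 'creme_brulee_sil_vous_plait'
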
def save(self, *args, **kwargs):
    if not self.id or not self.slug:
        self.slug = slugify(unidecode(self.title))
    super().save(*args, **kwargs)

def generate_json(input_file, output_path):
    if not is_valid_txt_file(input_file):
        print(f'Skipping input file ({input_file}) as it is not a txt file')
        return

    global pk

    if 'american_football' in input_file:
        sport_name = 'American Football'
    elif 'basketball' in input_file:
        sport_name = 'Basketball'
    elif 'table_tennis' in input_file:
        sport_name = 'Table Tennis'
    elif 'tennis' in input_file:
        sport_name = 'Tennis'
    elif 'football' in input_file:
        sport_name = 'Football'
    elif 'golf' in input_file:
        sport_name = 'Golf'
    elif 'cricket' in input_file:
        sport_name = 'Cricket'
    elif 'boxing' in input_file:
        sport_name = 'Boxing'
    elif 'rugby' in input_file:
        sport_name = 'Rugby'
    elif 'motorsports' in input_file:
        sport_name = 'Motorsports'
    elif 'combatsport' in input_file:
        sport_name = 'Combat Sports'
    elif 'baseball' in input_file:
        sport_name = 'Baseball'
    elif 'hockey' in input_file:
        sport_name = 'Hockey'
    else:
        print(f'Skipping input file ({input_file}) as the file name does not provide a match with the known list '
              f'of sports, which would break the relation field in the category model to a sport row in the db as '
              f'it requires a valid natural key')
        return

    try:
        f = open(input_file, encoding="utf-8")
    except FileNotFoundError as err:
        print(str(err))

    data = []
    date = '2019-09-01T00:00:00.000Z'
    line = f.readline()
    while line:
        unaccented_string = unidecode.unidecode(line[:-1]) if line.endswith('\n') else unidecode.unidecode(line)
        term = {
            'model': 'dictionary.category',
            'pk': pk,
            'fields': {
                'name': unaccented_string,
                'sport': (sport_name,)
            }
        }
        data.append(term)
        pk = pk + 1
        line = f.readline()
    f.close()

    json_file = os.path.splitext(input_file)[-2] + '.json'
    json_file_path = output_path + json_file
    with io.open(json_file_path, 'w', encoding='utf8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

def format_str(str):
    return unidecode(str.replace('"', ''))

# -*- coding: utf-8 -*-
import unicodedata
import string
from unidecode import unidecode


def remove_accents(data):
    return ''.join(x for x in unicodedata.normalize('NFKD', data)
                   if x in string.ascii_letters or x == '_').lower()


def remove_accents2(data):
    return filter(lambda char: char in string.ascii_uppercase, data.upper())


s = 'Números_distântes'.encode('utf8')
print(s)
print(unidecode(s.decode('utf-8')))

def _serialize_lote_rps(self, dados_lote_rps, dados_servico):
    dados_tomador = self._prepare_dados_tomador()
    return tpRPS(
        Assinatura=self.assinatura_rps(
            dados_lote_rps, dados_servico, dados_tomador),
        ChaveRPS=tpChaveRPS(
            InscricaoPrestador=self.convert_type_nfselib(
                tpChaveRPS, 'InscricaoPrestador',
                dados_lote_rps['inscricao_municipal'].zfill(8)),
            SerieRPS=self.convert_type_nfselib(
                tpChaveRPS, 'SerieRPS', dados_lote_rps['serie']),
            NumeroRPS=self.convert_type_nfselib(
                tpChaveRPS, 'NumeroRPS', dados_lote_rps['numero']),
        ),
        TipoRPS=self._map_type_rps(dados_lote_rps['tipo']),
        DataEmissao=self.convert_type_nfselib(
            tpRPS, 'DataEmissao',
            dados_lote_rps['data_emissao'].split('T', 1)[0]),
        StatusRPS=self.convert_type_nfselib(tpRPS, 'StatusRPS', 'N'),
        TributacaoRPS=self.convert_type_nfselib(
            tpRPS, 'TributacaoRPS',
            self._map_taxation_rps(dados_lote_rps['natureza_operacao'])),
        ValorServicos=self.convert_type_nfselib(
            tpRPS, 'ValorServicos', dados_servico['valor_servicos']),
        ValorDeducoes=self.convert_type_nfselib(
            tpRPS, 'ValorDeducoes', dados_servico['valor_deducoes']),
        ValorPIS=self.convert_type_nfselib(
            tpRPS, 'ValorPIS', dados_servico['valor_pis']),
        ValorCOFINS=self.convert_type_nfselib(
            tpRPS, 'ValorCOFINS', dados_servico['valor_cofins']),
        ValorINSS=self.convert_type_nfselib(
            tpRPS, 'ValorINSS', dados_servico['valor_inss']),
        ValorIR=self.convert_type_nfselib(
            tpRPS, 'ValorIR', dados_servico['valor_ir']),
        ValorCSLL=self.convert_type_nfselib(
            tpRPS, 'ValorCSLL', dados_servico['valor_csll']),
        CodigoServico=self.convert_type_nfselib(
            tpRPS, 'CodigoServico',
            dados_servico['codigo_tributacao_municipio']),
        AliquotaServicos=self.convert_type_nfselib(
            tpRPS, 'AliquotaServicos', dados_servico['aliquota']),
        ISSRetido='true' if dados_servico['iss_retido'] == '1' else 'false',  # FIXME: Hardcoded
        CPFCNPJTomador=self.convert_type_nfselib(
            tpRPS, 'CPFCNPJTomador', tpCPFCNPJ(
                CNPJ=dados_tomador['cnpj'],
                CPF=dados_tomador['cpf'])),
        InscricaoMunicipalTomador=self.convert_type_nfselib(
            tpRPS, 'InscricaoMunicipalTomador',
            dados_tomador['inscricao_municipal']),
        InscricaoEstadualTomador=self.convert_type_nfselib(
            tpRPS, 'InscricaoEstadualTomador',
            dados_tomador['inscricao_estadual']),
        RazaoSocialTomador=self.convert_type_nfselib(
            tpRPS, 'RazaoSocialTomador', dados_tomador['razao_social']),
        EnderecoTomador=tpEndereco(
            Logradouro=self.convert_type_nfselib(
                tpEndereco, 'Logradouro', dados_tomador['endereco']),
            NumeroEndereco=self.convert_type_nfselib(
                tpEndereco, 'NumeroEndereco', dados_tomador['numero']),
            ComplementoEndereco=self.convert_type_nfselib(
                tpEndereco, 'ComplementoEndereco', dados_tomador['complemento']),
            Bairro=self.convert_type_nfselib(
                tpEndereco, 'Bairro', dados_tomador['bairro']),
            Cidade=self.convert_type_nfselib(
                tpEndereco, 'Cidade', dados_tomador['codigo_municipio']),
            UF=self.convert_type_nfselib(
                tpEndereco, 'UF', dados_tomador['uf']),
            CEP=self.convert_type_nfselib(
                tpEndereco, 'CEP', dados_tomador['cep']),
        ),
        EmailTomador=self.convert_type_nfselib(
            tpRPS, 'EmailTomador', dados_tomador['email']),
        Discriminacao=self.convert_type_nfselib(
            tpRPS, 'Discriminacao', unidecode(
                dados_servico['discriminacao'] + (
                    '|%s|' % self.fiscal_additional_data.replace('\n', '|')
                    if self.fiscal_additional_data else ''))),
        ValorCargaTributaria=self.convert_type_nfselib(
            tpRPS, 'ValorCargaTributaria', dados_lote_rps['carga_tributaria']),
        FonteCargaTributaria=self.convert_type_nfselib(
            tpRPS, 'FonteCargaTributaria', dados_lote_rps['total_recebido']),
        MunicipioPrestacao=self.convert_type_nfselib(
            CabecalhoType, 'Versao',
            self._map_provision_municipality(
                dados_lote_rps['natureza_operacao'],
                dados_servico['codigo_municipio'])),
    )

def remove_non_ascii(text):
    return unidecode(text)

name_to_find = node_in_file.split(',')[1].split('\n')[0].replace('_', ' ').title()
id_of_searched = int(node_in_file.split(',')[0])
print(name_to_find)
print(id_of_searched)
print(list_of_nodes_to_add)
print(list_of_edges_to_add)

# skip on passed ids
if id_of_searched < position:
    continue

co_authors = AW.all_co_authors(name_to_find)

# run over every publication and add it to "nodes_by_lines"
print(co_authors)
for co_author in co_authors:
    author = unidecode(co_author.lower())
    author = ''.join([i for i in author if (i.isalpha()) | (i == ' ') | (i == '-') |
                      (i == '.') | (i == "'") | (i == "(") | (i == ")")])
    author_splitted = author.split(' ')

    # normalizing names
    normalized_author = ""
    if Got_Here:
        print("Got Here0")
    for i in range(len(author_splitted)):
        if (not author_splitted[i].isnumeric()):
            normalized_author += author_splitted[i] + '_'
    normalized_author = normalized_author[:len(normalized_author) - 1]
    normalized_splitted = normalized_author.split('_')
    if Got_Here:
        print("Got Here1")

    # find first and last names which do not contain '.' in them

def main():
    text = easygui.codebox("Enter the box-level inventory below.", "Enter Data")
    if not text or text.strip() == "":
        return

    # Remove potential whitespace at beginning and end
    text = unidecode.unidecode(text.strip())

    # First get settings header if it exists
    json_str = re.findall("^{.*}", text)
    json_str = json_str[0] if json_str else None
    opts = options.get_options(json_str)
    text = re.sub("^{.*}", "", text).strip()

    lines = text.split('\n')

    # Get first box number, first by trying to find it, then by inputbox if not
    first_num = re.findall(r'^[\s\n]*\d{1,3}', text)
    first_num = int(first_num[0].strip()) if first_num else easygui.integerbox(
        "Enter the box number.", "Enter Box Number", lowerbound=1, upperbound=999)
    first_num = first_num if first_num else 1

    entries = []
    last_entry = None
    extras = 0

    for l in lines:
        l = re.sub(r'\s+', " ", l).strip()

        # Perform user substitutions per line
        if opts["user_regex"] is not None and opts["user_subst"] is not None:
            l = opts["user_regex"].sub(opts["user_subst"], l)

        if re.match(r'^\d+(?:\-\d+)?\s', l) and last_entry:
            entries.append(last_entry)
            for _ in range(extras):
                entries.append(last_entry)
            last_entry = None
            extras = 0

        if re.match(r'^\d+(?:\-\d+)?\s', l):
            while re.match(r'^\d+(?:\-\d+)?\s', l):
                # If we have a range, determine the range
                rng = re.findall(r'^\d+\-\d+\s', l)
                if rng:
                    rng = [int(x) for x in rng[0].strip().split('-')]
                    extras = rng[1] - rng[0]
                l = re.sub(r'^\d+(?:\-\d+)?\s', "", l).strip()
            last_entry = l
        else:
            last_entry += "\n" + l

    if last_entry:
        entries.append(last_entry)
        for _ in range(extras):
            entries.append(last_entry)

    contents = ""
    for i, e in enumerate(entries, start=first_num):
        e = e.strip()  # Remove unnecessary leading/trailing space
        contents += "box\t"
        contents += "Box " + str(i)

        # Get all years from current entry. The replace business will change a
        # year like '66 to 1966 or '01 to 2001 (19/20 dependent on current year)
        years = [
            y.replace("'", "19" if int(y[1:]) > datetime.now().year % 100 else "20")
            if len(y) < 4 else y
            for y in re.findall(r"(?:[1-2]\d{3}|'\d\d)", e)
        ]
        years = [
            int(y) for y in years
            if opts["min_year"] <= int(y) <= datetime.now().year
        ]

        if years:
            if len(years) >= 2:
                # Pick the min and max (for out of order years or many years)
                years = [min(years), max(years)]
                # If they're the same, just make the second one blank
                if years[0] == years[1]:
                    years[1] = ""
            if len(years) == 1:
                # Add a blank entry so we have a tab still
                years.append("")
            contents += "\t" + "\t".join(str(y) for y in years) + "\t"
        else:
            contents += "\t\t\tn.d."

        contents += "\t" + str(i) + "\tbox\t\t"
        contents += '"' + e.replace('"', '""') + '"'
        contents += "\n"

    pyperclip.copy(contents[:-1])
    easygui.msgbox("Copied to clipboard.", "Done")

def _is_field_common_name(self, field: str) -> bool:
    field = unidecode(field).upper()
    return field in self.FIELD_NAMES

def slugify(title):
    return re.sub(r'\W+', '-', str(unidecode.unidecode(title)).lower()).strip(' -')

def slugify(str, separator='_'):
    str = unidecode.unidecode(str).lower().strip()
    return re.sub(r'\W+', separator, str).strip(separator)

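# Hypothetical usage sketch for the slugify variant above; the sample inputs are made up.
print(slugify('Déjà Vu!'))                  # -> 'deja_vu'
print(slugify('Déjà Vu!', separator='-'))   # -> 'deja-vu'
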
def lhandler(default, toconf, show_header=True):
    if show_header:
        print("We will now ask you to provide the list of languages you want to use.")
        print("Please list all the desired languages, comma-separated, using ISO 639-1 codes. The first language will be used as the default.")
        print("Type '?' (a question mark, sans quotes) to list available languages.")
    answer = ask('Language(s) to use', 'en')
    while answer.strip() == '?':
        print('\n# Available languages:')
        try:
            print(SAMPLE_CONF['_SUPPORTED_LANGUAGES'] + '\n')
        except UnicodeEncodeError:
            # avoid Unicode characters in supported language names
            print(unidecode.unidecode(SAMPLE_CONF['_SUPPORTED_LANGUAGES']) + '\n')
        answer = ask('Language(s) to use', 'en')

    langs = [i.strip().lower().replace('-', '_') for i in answer.split(',')]
    for partial, full in LEGAL_VALUES['_TRANSLATIONS_WITH_COUNTRY_SPECIFIERS'].items():
        if partial in langs:
            langs[langs.index(partial)] = full
            print("NOTICE: Assuming '{0}' instead of '{1}'.".format(full, partial))

    default = langs.pop(0)
    SAMPLE_CONF['DEFAULT_LANG'] = default
    # format_default_translations_config() is intelligent enough to
    # return the current value if there are no additional languages.
    SAMPLE_CONF['TRANSLATIONS'] = format_default_translations_config(langs)

    # Get messages for navigation_links. In order to do this, we need
    # to generate a throwaway TRANSLATIONS dict.
    tr = {default: ''}
    for l in langs:
        tr[l] = './' + l
    # Assuming that base contains all the locales, and that base does
    # not inherit from anywhere.
    try:
        messages = load_messages(['base'], tr, default, themes_dirs=['themes'])
        SAMPLE_CONF['NAVIGATION_LINKS'] = format_navigation_links(
            langs, default, messages, SAMPLE_CONF['STRIP_INDEXES'])
    except nikola.utils.LanguageNotFoundError as e:
        print("    ERROR: the language '{0}' is not supported.".format(e.lang))
        print("    Are you sure you spelled the name correctly? Names are case-sensitive and need to be reproduced as-is (complete with the country specifier, if any).")
        print("\nType '?' (a question mark, sans quotes) to list available languages.")
        lhandler(default, toconf, show_header=False)

def remove_nonunicode(tweet):
    return bytes(unidecode.unidecode(tweet), 'utf-8').decode('utf-8', 'ignore')

# coding: utf-8

# In[3]:

from unidecode import unidecode
import pandas as pd

scores = pd.read_csv("Score.csv")

# remove all the accents
for i in range(len(scores)):
    scores.LastName[i] = unidecode(unicode(scores.LastName[i]))
    scores.FirstName[i] = unidecode(unicode(scores.FirstName[i]))

scores.to_csv("Score_no_accent.csv")

def read_metadata(xml_file, module_id, package_type, year_str, month_str, month_str_ar):
    metadata = {}
    mun_name = Municipality.query.filter_by(
        municipal_id=current_user.municipal_id).first()
    tree = et.parse(xml_file)
    soup = tree.getroot()
    for item in soup.findall('module'):
        if int(item.get('id')) == module_id:
            for pack in item.find('packages').findall('package'):
                if package_type == pack.get('id'):
                    metadata = {
                        "name": pack.find('name').text.replace(
                            'mun_name',
                            unidecode.unidecode(mun_name.municipal_name)).lower().replace(' ', '-'),
                        "title": pack.find('title').text.replace(
                            'mun_name', mun_name.municipal_name),
                        "title_ar": pack.find('title_ar').text.replace(
                            'mun_name_ar', mun_name.municipal_name_ar),
                        "notes": pack.find('notes').text.replace(
                            'mun_name', mun_name.municipal_name),
                        "notes_ar": pack.find('notes_ar').text.replace(
                            'mun_name_ar', mun_name.municipal_name_ar),
                        "frequency_update": pack.find('frequency_update').text,
                        "keywords": {
                            "ar": [_.replace('mun_name_ar', '')
                                   for _ in pack.find('keywords_ar').text.split(',')],
                            "fr": [_.replace('mun_name', mun_name.municipal_name)
                                   for _ in pack.find('keywords_fr').text.split(',')]
                        },
                        "author": current_user.name + ' ' + current_user.last_name,
                        "author_email": current_user.email,
                        "maintainer": current_user.name + ' ' + current_user.last_name,
                        "maintainer_email": current_user.email,
                        "owner_org": mun_name.ckan_id,
                        "private": False,
                        "license_id": 'cc-by',
                        "groups": [{'name': pack.find('groups').text}],
                        "resources": []
                    }
                    for res in pack.find('resources').findall('resource'):
                        metadata['resources'].append({
                            'description': res.find('description').text.replace(
                                'mun_name', mun_name.municipal_name).replace(
                                'YYYY', year_str).replace('MMMM', month_str),
                            'description_ar': res.find('description_ar').text.replace(
                                'mun_name_ar', mun_name.municipal_name_ar).replace(
                                'YYYY', year_str).replace('MMMM_ar', month_str_ar),
                            'name': res.find('name').text.replace(
                                'mun_name', mun_name.municipal_name).replace(
                                'YYYY', year_str).replace('MMMM', month_str),
                            'name_ar': res.find('name_ar').text.replace(
                                'mun_name_ar', mun_name.municipal_name_ar).replace(
                                'YYYY', year_str).replace('MMMM_ar', month_str_ar),
                            'format': res.find('format').text,
                            'type': res.get('id')
                        })
                    break
            break
    return metadata

def convert_name(name):
    ascii_name = unidecode(name.replace(" ", "_")).lower().replace(" ", "-")
    return re.sub('-_', '_', ascii_name)

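# Hypothetical usage sketch for convert_name above; the sample name is made up.
print(convert_name("José María"))   # -> 'jose_maria'
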
def __call__(self, instance, filename):
    filename, ext = filename.rsplit('.', 1)
    filename = re.sub(r'[_.,:;@#$%^&?*|()\[\]]', '-', filename)
    filename = slugify(unidecode(smart_text(filename)))
    full_filename = '.'.join([filename, ext])
    return os.path.join(self.sub_path, full_filename)

def urlify(s):
    s = re.sub(r"[^\w\s\-]", '', s)
    s = re.sub(r"\s+", '-', s).lower()
    return unidecode(s)

def task_submit_post(request, task_id, submit_type):
    """Processing of an uploaded submit"""
    try:
        submit_type = int(submit_type)
    except ValueError:
        raise HttpResponseBadRequest

    # Raise Not Found when submitting non-existent task
    task = get_object_or_404(Task, pk=task_id)

    # Raise Not Found when submitting non-submittable submit type
    if not task.has_submit_type(submit_type):
        raise Http404

    # Raise Not Found when not submitting through POST
    if request.method != "POST":
        raise Http404

    try:
        sfile = request.FILES["submit_file"]
    except:  # noqa: E722 @FIXME
        # error will be reported from form validation
        pass

    # File will be sent to tester
    if (submit_type == constants.SUBMIT_TYPE_SOURCE
            or submit_type == constants.SUBMIT_TYPE_TESTABLE_ZIP):
        if submit_type == constants.SUBMIT_TYPE_SOURCE:
            form = SourceSubmitForm(request.POST, request.FILES)
        else:
            form = TestableZipSubmitForm(request.POST, request.FILES)
        if form.is_valid():
            if submit_type == constants.SUBMIT_TYPE_SOURCE:
                language = form.cleaned_data["language"]
            else:
                language = ".zip"
            # Source submits should be processed by process_submit()
            submit_id = process_submit(sfile, task, language, request.user)
            if not submit_id:
                messages.add_message(request, messages.ERROR, "Nepodporovaný formát súboru")
            else:
                # Source file-name is id.data
                sfiletarget = unidecode(
                    os.path.join(
                        get_path(task, request.user),
                        submit_id + constants.SUBMIT_SOURCE_FILE_EXTENSION,
                    ))
                write_chunks_to_file(sfiletarget, sfile.chunks())
                sub = Submit(
                    task=task,
                    user=request.user,
                    submit_type=submit_type,
                    points=0,
                    filepath=sfiletarget,
                    testing_status=constants.SUBMIT_STATUS_IN_QUEUE,
                    protocol_id=submit_id,
                )
                sub.save()
                if task.email_on_code_submit:
                    send_notification_email(sub, task_id, submit_type)
                success_message = format_html(
                    "Úspešne si submitol program, výsledok testovania nájdeš "
                    '<a href="{}">tu</a>',
                    reverse("view_submit", args=[sub.id]),
                )
                messages.add_message(request, messages.SUCCESS, success_message)
        else:
            for field in form:
                for error in field.errors:
                    messages.add_message(request, messages.ERROR,
                                         "%s: %s" % (field.label, error))
        if "redirect_to" in request.POST and request.POST["redirect_to"]:
            return redirect(request.POST["redirect_to"])
        else:
            return redirect(
                reverse("task_submit_page", kwargs={"task_id": int(task_id)}))

    # File won't be sent to tester
    elif submit_type == constants.SUBMIT_TYPE_DESCRIPTION:
        if request.user.is_competition_ignored(task.round.semester.competition):
            return HttpResponseForbidden()
        form = DescriptionSubmitForm(request.POST, request.FILES)
        if form.is_valid():
            sfiletarget = get_description_file_path(sfile, request.user, task)
            write_chunks_to_file(sfiletarget, sfile.chunks())
            sub = Submit(
                task=task,
                user=request.user,
                submit_type=submit_type,
                points=0,
                testing_status=constants.SUBMIT_STATUS_IN_QUEUE,
                filepath=sfiletarget,
            )
            sub.save()
            if task.email_on_desc_submit:
                send_notification_email(sub, task_id, submit_type)
            if task.round.can_submit:
                messages.add_message(
                    request,
                    messages.SUCCESS,
                    _("You have successfully submitted your description, "
                      "it will be reviewed after the round finishes."),
                )
            else:
                messages.add_message(
                    request,
                    messages.WARNING,
                    _("You have submitted your description after the deadline. "
                      "It is not counted in results."),
                )
        else:
            for field in form:
                for error in field.errors:
                    messages.add_message(request, messages.ERROR,
                                         "%s: %s" % (field.label, error))
        if "redirect_to" in request.POST and request.POST["redirect_to"]:
            return redirect(request.POST["redirect_to"])
        else:
            return redirect(
                reverse("task_submit_page", kwargs={"task_id": int(task_id)}))
    else:
        # Only Description and Source and Zip submitting is developed currently
        raise Http404

def safe_filename(accented_string):
    """ make a safe filename with no non-ascii chars """
    return "".join([c for c in unidecode.unidecode(accented_string)
                    if c.isascii() or c.isdigit() or c == ' ']).rstrip()

def get_valid_name(self, name):
    print 'It is called'
    return unidecode(name)

# for ff in ["label", "wikilinks"]:
for ff in ["label"] + kFEATURES.keys():
    print("Loading %s" % ff)
    feat = instantiate_feature(ff, qdb)

    if ff == "label":
        meta = open("features/expo/%s.meta" % flags.granularity, 'w')
    else:
        meta = None

    # Open the feature file for output
    filename = ("features/%s/%s.%s.feat" % ('expo', flags.granularity, ff))
    print("Opening %s for output" % filename)
    o = open(filename, 'w')

    for page in questions:
        for qq in questions[page]:
            for ss, tt, pp, line in feature_lines(qq, guess_list, flags.granularity, feat):
                assert ff is not None
                o.write("%s\n" % line)
                if not meta is None:
                    meta.write("%i\t%i\t%i\t%s\n" % (qq.qnum, ss, tt, unidecode(pp)))
                o.flush()

    o.close()
    print("Done with %s" % ff)

    # now that we're done with it, delete the feature
    del feat

def post(self, request):
    args = {}
    if request.method == 'POST':
        edit_control = self.request.POST.get('control')
        first_title = self.request.POST.get('first_title')
        form = PostForm(request.POST)
        print(form.is_valid())
        if form.is_valid():
            post = form.save(commit=False)

            if edit_control == "edit":
                post = Post.objects.get(author=request.user, title=first_title)
                post.title = form.cleaned_data['title']
                post.category = form.cleaned_data['category']
                post.body = form.cleaned_data['body']
                post.allow_comments = form.cleaned_data['allow_comments']
                text = unidecode.unidecode(post.title).lower()
                post.slug = re.sub(r'[\W_]+', '-', text)
                post.save(update_fields=[
                    'title', 'category', 'body', 'allow_comments', 'slug'
                ])
                PostListView.as_view()(self.request)
                return HttpResponseRedirect(post.get_absolute_url())
            elif edit_control == "delete":
                Post.objects.get(author=request.user, title=first_title).delete()
                PostListView.as_view()(self.request)
                return render(request, self.template_name, {
                    'delete_info': True,
                    'first_title': first_title
                })

            # add-post section
            try:
                exist_blog = Post.objects.get(title=form.cleaned_data['title'])
                return render(request, self.template_name, {
                    'exist_blog': exist_blog,
                    'post_form': form
                })
            except Post.DoesNotExist:
                post.author = request.user
                post.publish = timezone.now()
                post.title = form.cleaned_data['title']
                post.category = form.cleaned_data['category']
                post.body = form.cleaned_data['body']
                post.allow_comments = form.cleaned_data['allow_comments']
                text = unidecode.unidecode(post.title).lower()
                post.slug = re.sub(r'[\W_]+', '-', text)
                form.save()
                PostListView.as_view()(self.request)
                return HttpResponseRedirect(post.get_absolute_url())
        # else:
        print(form.errors)
    else:
        pass
    return PostListView.as_view()(self.request)

def generate_misspell_sample(query, n=25, max_edit_distance=4):
    tokens = query.split()
    if len(tokens) == 1:
        n = 15
    elif len(tokens) == 2:
        n = 25
    elif len(tokens) == 3:
        n = 35
    else:
        n = 45

    clenq = len(query)
    vv = clenq / 4
    if vv == 0:
        vv = 1
    max_edit_distance = int(min(vv, max_edit_distance))

    results = set()
    for _ in range(n):
        actions = [random.choice(ACTIONS) for i in range(max_edit_distance)]
        qx = query
        for a in actions:
            clen = len(qx)
            pos = random.randint(0, clen - 1)
            if a == "DO_NOTHING":
                continue
            if a == "INSERT":
                rc = random.sample(CHARS, 1)[0]
                qx = qx[:pos + 1] + rc + qx[pos + 1:]
            elif a == "REMOVE":
                qx = qx[:pos] + qx[pos + 1:]
            elif a == "NORMAL_REPLACE":
                c = qx[pos]
                if c not in SAMPLING_MAP:
                    qx = qx[:pos] + random.choice(CHARS) + qx[pos + 1:]
                else:
                    qx = qx[:pos] + random.choice(SAMPLING_MAP[c]) + qx[pos + 1:]
            elif a == "TELEX_REPLACE":
                c = qx[pos]
                if c not in VN_TELEX:
                    if c not in SAMPLING_MAP:
                        qx = qx[:pos] + random.choice(CHARS) + qx[pos + 1:]
                    else:
                        qx = qx[:pos] + random.choice(SAMPLING_MAP[c]) + qx[pos + 1:]
                else:
                    if pos < len(qx) - 1:
                        nc = qx[pos + 1]
                        telex_c = random.choice(VN_TELEX[c])
                        if nc != u" ":
                            qx = qx[:pos] + telex_c[0] + nc + telex_c[1] + qx[pos + 2:]
                        else:
                            qx = qx[:pos] + telex_c[0] + telex_c[1] + qx[pos + 1:]
            elif a == "UNACCENT":
                tokens = qx.split()
                tc = random.randint(0, len(tokens) - 1)
                tokens[tc] = unidecode(tokens[tc])
                qx = u" ".join(tokens)

        qx = u" ".join(qx.split())
        if qx != query:
            results.add(qx)
    return results

# Get latest IDs because for some reason Papers and Works aren't autoincrementing
paper_id = Papers.objects.latest("paperid").paperid + 1
author_id = Authors.objects.latest("authorid").authorid + 1

for article in articles:
    title = article.find(class_="articleTitle").get_text().strip()

    # Conference proceeding entries look like articles but don't have an articleauthors field
    try:
        authors = article.find(class_="articleAuthors").find_all("a")
    except AttributeError:
        continue

    authors_sql = []
    for author in authors:
        # Sometimes names are accented, we avoid that by converting to unidecode
        author_string = unidecode(author.string)
        try:
            author_sql = Authors.objects.get(authorname=author_string)
        except Authors.DoesNotExist:
            author_sql = Authors.objects.create(authorname=author_string, authorid=author_id)
            author_id += 1
            author_sql.save()
        authors_sql.append(author_sql)

    doi = article.find(class_="articleCitation").get_text().strip().split()[0]
    paper = Papers.objects.create(paperid=paper_id, title=title, doi=doi, numauthors=len(authors_sql))
    paper_id += 1
    paper.save()

    for author in authors_sql:
        work = Works.objects.create(authorid=author.authorid, paperid=paper.paperid)
        work.save()

def remove_tone(self, s):
    return unidecode.unidecode(s)

def get_missing_deets(url):
    print(url)
    r = requests.get(url)
    page = BeautifulSoup(r.content)
    missing_dict = {}
    for body in page.findAll('tbody'):
        if None != body.find('span').find('p'):
            if 'Missing' in body.find('span').find('p').text:
                ## prep to extract
                num = len(body.findAll('p'))
                if num > 1:
                    txt = body.findAll('p')[1].text
                    name = body.find('span').find('p').text.split('Missing Child: ')[1]
                else:
                    txt = body.text
                    name = page.find('title').text
                txt = txt.replace('\n', '')
                for i in range(10):
                    txt = txt.replace('  ', ' ').strip(' ')
                for col in COLS:
                    txt = txt.replace(col, ' :{}'.format(col))
                txt = txt.strip(' ')

                ## get info about child
                tmp_dict = {'abductor': False}
                keys = txt.split(':')[1:][::2]
                values = [unidecode(x).strip(' ') for x in txt.split(':')[1:][1::2]]
                for i in range(len(keys)):
                    tmp_dict.update({keys[i].replace(' ', '_').lower(): values[i]})

                ## update overall dict
                missing_dict.update({name: tmp_dict})
            elif 'Abductor' in body.text:
                ## clean
                txt = unidecode(body.text.replace('\n', ''))
                for i in range(10):
                    txt = txt.replace('  ', ' ').strip(' ')
                for col in COLS:
                    txt = txt.replace(col, ' :{}'.format(col))
                txt = txt.strip(' ')

                ## get info about abductor
                tmp_dict = {'abductor': True}
                keys = txt.split(':')[1:][::2]
                values = [unidecode(x).strip(' ') for x in txt.split(':')[1:][1::2]]
                key_len = np.array([len(keys), len(values)]).min()
                for i in range(len(keys)):
                    if keys[i] == 'Abductor':
                        name = values[i]
                    else:
                        tmp_dict.update({keys[i].replace(' ', '_').lower(): values[i]})
                missing_dict.update({name: tmp_dict})

    df = pd.DataFrame.from_dict(missing_dict).transpose().reset_index().rename(
        columns={'index': 'name'})
    images = [x.get('src') for x in page.findAll('img', src=True)
              if 'missing/kids/' in x.get('src')]
    df['image'] = images[:len(df)]
    return df

    open(flags.buzz, "w"),
    ["question", "sentence", "word", "page", "evidence", "final", "weight"],
)
o_buzz.writeheader()

o_final = DictWriter(open(flags.final, "w"), ["question", "answer"])
o_final.writeheader()

for question in results:
    pos, guess = results[question]
    ss, tt = word_position_to_sent(questions, question, pos)
    for sent_offset, sent in enumerate(questions[question]):
        question_line = {}
        question_line["id"] = question
        question_line["answer"] = unidecode(answers[question])
        question_line["sent"] = sent_offset
        question_line["text"] = unidecode(sent)
        o_questions.writerow(question_line)

    buzz_line = {}
    buzz_line["question"] = question
    buzz_line["sentence"] = ss
    buzz_line["word"] = tt
    buzz_line["page"] = guess
    buzz_line["final"] = 1
    buzz_line["weight"] = 1.0
    o_buzz.writerow(buzz_line)

    final_line = {}
    final_line["question"] = question

def slugify(text, delim=u'-'):
    """Generates an ASCII-only slug."""
    result = []
    for word in _punct_re.split(text.lower()):
        result.extend(unidecode(word).split())
    return unicode(delim.join(result))

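# Hypothetical Python 2 usage sketch for the slugify above, assuming the module-level
# _punct_re splits on punctuation and whitespace as in the well-known Flask slug snippet.
print slugify(u'Héllo, Wörld!')          # -> u'hello-world'
print slugify(u'Héllo, Wörld!', u'_')    # -> u'hello_world'
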