def extract_info(article):
    '''
    INPUT: dict object with output from the api
    OUTPUT: bool if extraction was successful or not,
            dict object to insert into mongodb
    '''
    headline = unidecode(article['webTitle'])
    date_published = str(article['webPublicationDate'])
    try:
        author = article['blocks']['body'][0]['createdBy']['firstName'] + ' ' + article['blocks']['body'][0]['createdBy']['lastName']
    except:
        author = None
    try:
        url = str(article['webUrl'])
    except:
        return False, ''
    try:
        article_text = '\n'.join([unidecode(text_block['bodyTextSummary']) for text_block in article['blocks']['body']])
    except:
        return False, ''
    insert = {'url': url,
              'source': 'guardian',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
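A minimal usage sketch for the function above, assuming it lives in a module that already imports unidecode; the sample article dict and the 'news'/'articles' database and collection names are illustrative, not taken from the original project:

# Hypothetical driver code for extract_info(); the sample dict mimics the
# Guardian API shape the function expects.
from pymongo import MongoClient

sample_article = {
    'webTitle': 'Example headline',
    'webPublicationDate': '2020-01-01T00:00:00Z',
    'webUrl': 'https://www.theguardian.com/example',
    'blocks': {'body': [{'bodyTextSummary': 'Example body text.',
                         'createdBy': {'firstName': 'Jane', 'lastName': 'Doe'}}]},
}

ok, doc = extract_info(sample_article)
if ok:
    MongoClient()['news']['articles'].insert_one(doc)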
Example #2
File: htrc.py Project: inpho/vsm
def page_url(corpus, ctx_type, book_path, book_id, jsonfile):
    """
    Modified htrc_*_label_fn. The individual volumes don't have 'book' as a context type.
    """
    import json
    import os
    import re
    from vsm.viewer import doc_label_name

    urls = []
    corp_md = corpus.view_metadata("page")

    jsonpath = os.path.join(book_path, jsonfile)
    with open(jsonpath, "r") as f:
        md = json.load(f)
        url = ""
        li = sorted(md["items"], key=lambda k: int(k["lastUpdate"]))
        url = li[-1]["itemURL"]

        if ctx_type == "book":
            urls.append(unidecode(url))
        else:  # urls for pages
            page_md = corpus.view_metadata("page")
            files = page_md[doc_label_name("page")]

            nums = [re.findall("[1-9][0-9]*", a)[-1] for a in files]
            for i in nums:
                s = url + "?urlappend=%3Bseq={0}".format(i)
                urls.append(unidecode(s))
    return urls
Example #3
    def get_items_by_letter(self):
        items_by_letter = OrderedDict()

        for letter, items in groupby(self.get_items(), self.sortkey):
            items_by_letter[unidecode(letter)] = tuple(sorted(items, key=lambda i: unidecode(self.get_title(i))))

        return items_by_letter
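The method above assumes self.get_items() already yields items sorted by self.sortkey. A standalone sketch of the same letter-grouping idea, with plain functions standing in for the instance methods (all names and sample titles are illustrative):

# Standalone sketch of the letter-grouping pattern above.
from collections import OrderedDict
from itertools import groupby
from unidecode import unidecode

titles = ['Éclair', 'Espresso', 'Brioche', 'Bagel']
sortkey = lambda t: unidecode(t)[0].upper()   # stands in for self.sortkey
items_by_letter = OrderedDict()
for letter, group in groupby(sorted(titles, key=sortkey), sortkey):
    items_by_letter[unidecode(letter)] = tuple(sorted(group, key=unidecode))
print(items_by_letter)
# OrderedDict([('B', ('Bagel', 'Brioche')), ('E', ('Éclair', 'Espresso'))])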
Example #4
def crawl_video_urls(url='http://documentaryheaven.com/category/space/'):
    # NOTE: MyOpener and the results list are assumed to be defined at module level in the original project.
    myopener = MyOpener()
    page = myopener.open(url)
    page = page.read()

    html = BeautifulSoup(page, "lxml")

    # find all class=post
    posts = html.find_all('div', class_="post")

    # for each class=post:
    for p in posts:
        obj = {}
        #class=post-title --> a (href, string)
        title = p.find('h2').find('a')
        obj['url'] = title['href']
        obj['title'] = unidecode(title.string)
        abstract = p.find('div', class_='browse-description').find('p')
        obj['abstract'] = unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
        #class=browse-description --> p (string)

        results.append(obj)
    # next page: class=next --> (href)
    next_page = html.find('a', class_="next page-numbers")

    if not next_page:
        return None
    print results
    print next_page['href']

    return crawl_video_urls(url=next_page['href'])
def extract_info(article):
    '''
    INPUT: dict object with output from the api
    OUTPUT: bool if extraction was successful or not,
            dict object to insert into mongodb
    '''
    headline = unidecode(article['title']['$text'])
    date_published = str(article['pubDate']['$text'])
    try:
        author = [str(author['name']['$text']) for author in article['byline']]
    except:
        author = None
    try:
        url = str(article['link'][0]['$text'])
    except:
        return False, ''
    try:
        article_text = unidecode(' '.join([line.get('$text', '\n') for line in article['text']['paragraph']]))
    except:
        return False, ''
    insert = {'url': url,
              'source': 'npr',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
Example #6
 def save(self, force_insert=False, force_update=False, using=None,
          update_fields=None):
     if update_generated_value(self, 'name', 'search_name'):
         self.search_name = unidecode(self.name)
     if update_generated_value(self, 'surname', 'search_surname'):
         self.search_surname = unidecode(self.surname)
     super(KnownPlayer, self).save(force_insert, force_update, using, update_fields)
	def test_ascii(self):

		log = []
		def showwarning_new(message, category, *args):
			if ("not an unicode object" in str(message)) and \
					(category is RuntimeWarning):
				log.append((message, category))
			else:
				showwarning_old(message, category, *args)

		showwarning_old = warnings.showwarning
		warnings.showwarning = showwarning_new
		warnings.filterwarnings("always")

		for n in xrange(0,128):
			t = chr(n)
			self.assertEqual(unidecode(t), t)

		# Passing string objects to unidecode should raise a warning
		self.assertEqual(128, len(log))
		log = []

		for n in xrange(0,128):
			t = unichr(n)
			self.assertEqual(unidecode(t), t)

		# unicode objects shouldn't raise warnings
		self.assertEqual(0, len(log))

		warnings.showwarning = showwarning_old
def parsefile(f,inPre,titleSet,per,loc,org,other):
    fin = codecs.open(inPre+f, encoding = 'utf-8')
    for line in fin:
        if len(line.strip().split("\t")) != 11:
            continue
        ID,url,title,source,created_at,authors,key_word,snippets,raw_text,\
            h_tokens_ent,b_tokens_ent = line.strip().split("\t")
              #  h_tokens,b_tokens,\
        if title in titleSet:
            continue
        else:
            titleSet.add(title)
        if len(b_tokens_ent.split()) > MAX_BODY_LEN:            
            continue

        h_tokens_ent = unidecode.unidecode(h_tokens_ent.strip())
        b_tokens_ent = unidecode.unidecode(b_tokens_ent.strip())
        #h = grep_ent_with_context(h_tokens_ent,per,loc,org,other)  # fds_per_| asked me about ...
        #b = grep_ent_with_context(b_tokens_ent,per,loc,org,other)
        h = grep_ent(h_tokens_ent,per,loc,org,other) # fsd_per_| oregon_loc_| ...
        b = grep_ent(b_tokens_ent,per,loc,org,other)
        h = rep2.sub('', h)
        b = rep2.sub('', b)
        h = my_tokenizer(h, tokenizer)
        b = my_tokenizer(b, tokenizer)
        tokens = h+' '+h+' '+b  # title twice
        yield tokens.lower(),bk.News(ID,title,raw_text,snippets,key_word,source,created_at,f.split('.')[0],h_tokens_ent,b_tokens_ent) # can also leave lowercase to scikit
    fin.close()
Example #9
def path(start, end):
  start =unidecode.unidecode(start)
  end = unidecode.unidecode(end)
  print start
  print end
  path = find_path(start, end)
  return jsonify(path=path)
def attributeUUID(context):
    sql_id = getattr(context, 'sql_id', None)
    try:
        sql_id = str(unidecode(str(sql_id)))
    except:
        sql_id = str(unidecode(sql_id))
    return str(getattr(context, 'portal_type', ''))+'-'+sql_id
Example #11
def do_the_trick():
    print "\n\t-->> [Collecting]"
    global dot
    global samples
    dot, samples = do_fetch()
    if len(samples)>0:
        print "\n\t-->> [Playing]:"
    for ind_s, s in enumerate(samples):
        print "\n<.%s.>" % s['text']
        # treat msg for spacing and tokenizing
        for j,k in enumerate(kws):
            if unidecode(k).lower() in unidecode(s['text']).lower():                
                
                newTweet = M.tweetmetanalyze(unidecode(s['text']))
                ste = newTweet
                print "U:", ste

                #here, send osc
                try:
                    cmnd = OSC.OSCMessage("/tweet")
                    cmnd.append(ste)
                    cmnd.append(ind_s)
                    oscClient.send(cmnd)

                    cmnd = OSC.OSCMessage("/palabra")
                    cmnd.append(categorias_emotivas[gesto_to_class[k]])
                    cmnd.append(gesto_to_inten[k])
                    oscClient_ari.send(cmnd)

                except:
                    print '\n\tAquí le falló\n\t'
        sleep(randint(1,5))
Example #12
def normalize_unicode(s):
    if type(s) == bytearray:
        return unidecode(unicode(str(s), 'utf-8')).lower()
    elif type(s) == unicode:
        return unidecode(s).lower()
    else:
        return unidecode(unicode(s, 'utf-8')).lower()
def parseCISI(title, tmdb_title = None):
    movs = search(title)
    mov = None
    mov_id = None
    imdb_id = None
    year = None
    ss = []
    sel = 'n'
    if movs is not None and len(movs) > 0:
        for m in movs:
            cisi_title = unidecode(m['title']).replace(',', '')
            if cisi_title.lower() == title.lower():
                sel = 'y'
                break
            elif title.lower() in cisi_title.lower() or cisi_title.lower() in title.lower():
                sel = raw_input(
                    "Matching '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                        title
                        , cisi_title
                        , m['_id']
                    )
                ).lower()
                if sel == 'y':
                    break
            print("Trying again...")
    elif tmdb_title is not None:
        movs = search(tmdb_title)
        sel = 'n'
        if movs is not None and len(movs) > 0:
            for m in movs:
                cisi_title = unidecode(m['title'].decode('utf-8'))
                if cisi_title.lower() == tmdb_title.lower():
                    sel = 'y'
                    break
                elif tmdb_title.lower() in cisi_title.lower() or cisi_title.lower() in tmdb_title.lower():
                    sel = raw_input(
                        "Matching TMDB '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                            tmdb_title
                            , cisi_title
                            , m['_id']
                        )
                    ).lower()
                    if sel == 'y':
                        break
                    else:
                        print("Trying again...")
    if sel == 'y':
        mov = m
        mov_id = str(m['_id'])
        year = int(m['year'])
        if 'imdb' in m['links'].keys():
            imdb_id = str(m['links']['imdb'].split("/")[-2])
    else:
        print("Unable to find match in canistream.it for '{}'".format(title))
    if mov is not None:
        ss = getStreams(mov_id)
        print("* MATCHED canistream.it")
    elif tmdb_title is not None:
        print("Streaming availability won't be available.")
    return mov_id, year, ss, imdb_id
Example #14
 def get_absolute_url(self):
     return('nhom', (), {
         'monhoc': slugify(unicode(unidecode(self.mon_hoc.ten_mon))),
         'monhocid': self.mon_hoc_id,
         'nhom': slugify(unicode(unidecode(self.ten_nhom))),
         'nhomid': self.pk,
     })
Example #15
def getIndividualSubject(roster_semester,subject):
  url = COURSE_ROSTER_API_CLASSES + roster_semester + '&subject=' + subject
  soup = BeautifulSoup(requests.get(url).text)

  classes = soup.find_all('class')
  for c in classes:
    listing = subject + c.find('catalognbr').text
    if listing not in COURSE_DICT:
      name = unidecode(c.find('titlelong').text.replace('\n', ' '))
      units_min = c.find('unitsminimum').text
      units_max = c.find('unitsmaximum').text
      if units_min == units_max:
        credits = units_min
      else:
        credits = units_min + "-" + units_max
      course_obj = Course(listing,name,credits)
      course_obj.description   = unidecode(c.find('description').text.replace('\n', ' '))
      course_obj.offered       = unidecode(c.find('catalogwhenoffered').text.replace('\n', ' '))
      course_obj.prerequisites = unidecode(c.find('catalogprereqcoreq').text.replace('\n', ' '))
      course_obj.arts_tags     = unidecode(c.find('catalogdistr').text.replace('\n', ' '))
      crosslists = []
      for combination in c.find_all('combination'):
        crosslists.append(combination.find('subject').text + combination.find('catalognbr').text)
      course_obj.crosslisted_classes = ";".join(crosslists)
      COURSE_DICT[listing] = course_obj
      print str(course_obj)
      print '-' * 50
Example #16
    def js_signature(self, input_map=False):
        """Returns a the javascript signature for a prediction method.

        """
        objective_field = self.tree.fields[self.tree.objective_id]
        if not 'CamelCase' in objective_field:
            camelcase = to_camel_js(unidecode(objective_field['name']), False)
            objective_field['CamelCase'] = camelcase

        output = u"function predict%s(" % objective_field['CamelCase']

        args = []
        if len(self.tree.fields) > MAX_ARGS_LENGTH or input_map:
            args.append("data")
        else:
            for field in [(key, val) for key, val in
                          sort_fields(self.tree.fields)]:
                field_obj = self.tree.fields[field[0]]
                if not 'camelCase' in field_obj:
                    field_obj['camelCase'] = to_camel_js( \
                        unidecode(field_obj['name']))
                if field[0] != self.tree.objective_id:
                    args.append(u"%s" % field_obj['camelCase'])
        args_string = u", ".join(args)
        output += args_string + u")"

        return output
Example #17
    def plug_in(self, out=sys.stdout, hadoop=False,
                filter_id=None, subtree=True):
        """Generates a basic javascript implementation of local predictions

        `out` is file descriptor to write the javascript code.

        """
        # fill the camelcase variable names with the JS_KEYWORDS restrictions
        objective_field = self.tree.fields[self.tree.objective_id]
        camelcase = to_camel_js(unidecode(objective_field['name']), False)
        objective_field['CamelCase'] = camelcase
        for field in [(key, val) for key, val in
                      sort_fields(self.tree.fields)]:
            field_obj = self.tree.fields[field[0]]
            field_obj['camelCase'] = to_camel_js(unidecode(field_obj['name']))

        body, term_analysis_predicates, item_analysis_predicates = \
            self.tree.plug_in_body()
        terms_body = ""
        items_body = ""
        if term_analysis_predicates:
            terms_body = self.js_term_analysis_body(term_analysis_predicates)
        if item_analysis_predicates:
            items_body = self.js_item_analysis_body(item_analysis_predicates)
        output = self.js_pre_body()
        output += terms_body + items_body + body
        output += u"%sreturn null;\n}\n" % INDENT
        if not PY3:
            output = output.encode("utf8")
        out.write(output)
        out.flush()
Example #18
    def test_surrogate_pairs(self):
        # same character, written as a non-BMP character and a
        # surrogate pair
        s = _u('\U0001d4e3')

        # Note: this needs to be constructed at run-time, otherwise
        # a "wide" Python seems to optimize it automatically into a
        # single character.
        s_sp_1 = _u('\ud835')
        s_sp_2 = _u('\udce3')
        s_sp = s_sp_1 + s_sp_2

        if sys.version_info < (3,4):
            self.assertEqual(s.encode('utf16'), s_sp.encode('utf16'))
        else:
            self.assertEqual(s.encode('utf16'), s_sp.encode('utf16', errors='surrogatepass'))

        wlog = WarningLogger()
        wlog.start("Surrogate character")

        a = unidecode(s)
        a_sp = unidecode(s_sp)

        self.assertEqual('T', a)

        # Two warnings should have been logged
        self.assertEqual(2, len(wlog.log))

        wlog.stop()
Example #19
def toascii(s):
    from unidecode import unidecode
    import codecs
    if isinstance(s,str):
        return unidecode(codecs.decode(s, 'utf-8'))
    elif isinstance(s,list):
        return map(lambda x:unidecode(codecs.decode(x, 'utf-8')),s)
Example #20
def _fetch_artwork_info(image_id, page_url):
    """
    Scrape the artwork info page for relevant properties to return dict.
    """
    r = _get_response(page_url)
    soup = bs4.BeautifulSoup(r.text, "lxml")
    info = {}
    for tag in soup.find_all(lambda _tag: _tag.has_attr('itemprop')):
        itemprop_name = tag.attrs['itemprop']
        if itemprop_name == 'keywords':
            keywords = [x.strip().strip(',').lower() for x in tag.strings]
            print keywords
            keywords = [x for x in keywords if x != '']
            info[itemprop_name] = keywords
        elif tag.name == 'img':
            # itemprop='image'
            info[itemprop_name] = tag['src'].split('!')[0]
        else:
            # TODO: parse itemprop='name' differently,
            # as there are 2 names: for artist and for artwork
            info[itemprop_name] = unidecode(tag.text.strip().lower())

    for tag in soup.find_all('div', attrs={'class': 'info-line'}):
        strings = [unidecode(x).lower() for x in tag.stripped_strings]
        if len(strings) == 0:
            continue
        if strings[0] == 'style:':
            info['style'] = '$'.join(strings[1:])
        elif strings[0] == 'media:':
            info['media'] = map(lambda s: s.strip(','), strings[1:])
            info['media'] = [x for x in info['media'] if len(x) > 0]
        elif strings[0] == 'location:':
            info['location'] = '$'.join(strings[1:])

    return info
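Since _fetch_artwork_info above needs a live page, here is a self-contained sketch of its itemprop-scanning loop run on an inline HTML fragment; the fragment, URL and values are illustrative, not data from the original site:

# Self-contained sketch of the itemprop-scanning pattern used above.
import bs4
from unidecode import unidecode

html = '''<div>
  <h1 itemprop="name">Café Terrace at Night</h1>
  <img itemprop="image" src="https://example.com/img.jpg!Large.jpg">
  <span itemprop="keywords"><a>night</a>, <a>café</a></span>
</div>'''
soup = bs4.BeautifulSoup(html, "lxml")
info = {}
for tag in soup.find_all(lambda _tag: _tag.has_attr('itemprop')):
    itemprop_name = tag.attrs['itemprop']
    if itemprop_name == 'keywords':
        keywords = [x.strip().strip(',').lower() for x in tag.strings]
        info[itemprop_name] = [x for x in keywords if x != '']
    elif tag.name == 'img':
        info[itemprop_name] = tag['src'].split('!')[0]
    else:
        info[itemprop_name] = unidecode(tag.text.strip().lower())
print(info)
# {'name': 'cafe terrace at night',
#  'image': 'https://example.com/img.jpg',
#  'keywords': ['night', 'café']}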
Example #21
def xml_to_txt(full_path, docType, docNumTag, sectionTag, secNumTag, secValueTag, noteTag):
    
    soup = Soup(open(full_path, "rb").read())

    docnum = soup.find(docNumTag).text.replace("Title ", "")
    
    for section in soup.findAll(sectionTag):
        parents = [x.name for x in section.parents]
        if noteTag not in parents:
            sec = ""
            try:
                sec = section.find(secNumTag)
                sec = unidecode(sec[secValueTag])
            except:
                sec = unidecode(section.find(secNumTag).text).replace("SS", "§").strip(" ").strip(".").split(" ")[-1]#.replace("Sec. ", "")
                sec = sec.replace(" to ", "_to_")
                sec = sec.replace(" through ", "_to_")
        
                sec = sec.strip()
            outpath = "../output/xml/{}/{}/sections/".format(docType, docnum)
            
            if not os.path.exists(outpath):
                os.makedirs(outpath)
    
            filepath = "{}{}.txt".format(outpath, sec)
            if not os.path.exists(filepath):
                f = open(filepath, 'w', encoding="utf-8")
                f.write(str(section))
                print("   --XML added to directory {}".format(filepath))
Example #22
def create_station(genre, phone_number):
    station = Station.objects.create()
    account, created = Account.objects.get_or_create(phone_number=phone_number)
    account.stations.add(station)
    account.current_station = station
    account.save()
    artists = get_artists(genre)
    client = soundcloud.Client(client_id=settings.SOUNDCLOUD_CLIENT_ID)
    for artist in artists:
        tracks = client.get('/tracks', q=artist['name'], streamable=True)
        for track in tracks:
            if track.stream_url and track.streamable and track.duration < MAX_DURATION:
                try:
                    song = Song.objects.create(
                        title=unidecode(track.title),
                        sid=track.id, 
                        genre=unidecode(track.genre)
                    )
                    station_song = StationSong.objects.create(
                        song=song, 
                        index=station.songs.count()
                    )
                    station.songs.add(station_song)
                    station.save()
                    break
                except:
                    print "SoundCloud fail >.<" + str(track)
    return station.pk
Example #23
    def _alignBySplittingToken(self, tag, word, t_iter):
        # alignment helper
        self.logger.debug('tag %s exceeds word %s', repr(tag.word), repr(word))
        tmp = list(tag)
        words = [word]
        asciis = [unidecode(word).replace('-', '')]
        tag_word = ''.join(self.tokenizer.split(tag.word))
        aligned = lambda: ''.join(asciis) == tag_word
        max_len = len(tag_word)
        aligned_tags = []

        while not aligned() and sum(map(len, asciis)) < max_len:
            words.append(next(t_iter))
            asciis.append(unidecode(words[-1]).replace('-', ''))

        if aligned():
            self.logger.debug('dropping tag %s [%s] for words "%s"',
                              repr(tag.word), tag[-1], ' '.join(words))

            for w, a in zip(words, asciis):
                tmp[0] = w
                tmp[1] = a
                self.logger.debug('adding tag %s [%s]', repr(w), tmp[-1])
                aligned_tags.append(Token(*tmp))

                for p in (3, 4):
                    if tmp[p].startswith('B-'):
                        tmp[p] = 'I' + tmp[p][1:]
        else:
            raise RuntimeError('alignment of words %s as %s to token "%s" as "%s" failed' % (
                repr(words), repr(asciis), tag.word, tag_word
            ))

        return aligned_tags
Example #24
def _sort_glossary(qresult, lang):
    """
    Sort the result into categories and questions from response returned by the backend engine
    """
    glossary_content = []
    letters = []
    letters_found = OrderedDict()
    field = "glossary_term_lang_" + lang

    for i in string.ascii_uppercase:
        letters.append(i)
        letters_found[i] = 0

    if len(qresult) > 0:
        # Process results
        from itertools import groupby

        items = [o.get_stored_fields() for o in qresult]
        items = sorted(items, key=lambda x: unidecode(x[field]))
        for k, g in groupby(items, key=lambda x: unidecode(x[field])[0]):
            letters_found[k] = 1
            glossary_content.append(
                {
                    "letter": k,
                    "terms": [
                        {"term": item[field], "description": item["glossary_description_lang_" + lang]} for item in g
                    ],
                }
            )

    return letters, letters_found, glossary_content
def fast_iter(context, func,*args, **kwargs):
	collaborations = [u'www', u'phdthesis', u'inproceedings', u'incollection', u'proceedings', u'book', u'mastersthesis', u'article']
	#xml categories
	author_array = []
	title = ''

	#read chunk line by line
	#we focus author and title
	for event, elem in context:
		if elem.tag == 'author':
			author_array.append(unidecode(elem.text))

		if elem.tag == 'title':
			if elem.text:
				title = unidecode(elem.text)

		if elem.tag in collaborations:
			if len(author_array) is not 0 and title is not '':
				#rejected paper has no author or title
				#it should be check

				for a in author_array:
					func(a+"||"+title, *args, **kwargs)
					#write into kv file

				title = ''
				del author_array[:]

		elem.clear()
		while elem.getprevious() is not None:
			del elem.getparent()[0]
	del context
Example #26
def scrape_wikitables():
    """Scrapes wikipedia for the list of current top boxers"""

    champURL = "https://en.wikipedia.org/wiki/List_of_current_boxing_rankings"
    page = urllib.request.urlopen(champURL)
    soup = bs4.BeautifulSoup(page, "html5lib")

    tables = soup.find_all("table", {"class": "wikitable"})
    unique_boxers = []

    for table_number in range(1, 6):
        table = tables[table_number]
        rows = table.find_all("tr")
        for row in rows:
            data = row.find_all("td")
            text = [i.text for i in data]
            for boxer_name in range(len(text)):
                if len(text[boxer_name]) > 3:
                    boxer_name = text[boxer_name].rstrip('\n')
                    boxer_name = re.findall(r"\S{3,}\ .[^\ \(]+", boxer_name)
                    if len(boxer_name) > 0:
                        if unidecode(boxer_name[0]) not in unique_boxers:
                            unique_boxers.append(unidecode(boxer_name[0]))

    unique_boxers.sort()
    return unique_boxers
def get_forms(c):
    global forms

    c.execute("SELECT DISTINCT species_id FROM pokemon WHERE id IN (SELECT pokemon_id FROM pokemon_forms WHERE form_identifier != 'NULL' ORDER BY pokemon_id) ORDER BY species_id")
    species_ids = c.fetchall()

    for i in range(len(species_ids)):
        c.execute("SELECT name FROM pokemon_species_names WHERE pokemon_species_id=%d AND local_language_id=9" % species_ids[i][0])
        species_name = str(unidecode(c.fetchone()[0])).replace("-","_").replace(" ","_").replace(".","").replace("'","")

        c.execute("SELECT pokemon_form_id,form_name FROM pokemon_form_names WHERE pokemon_form_id IN (SELECT id FROM pokemon_forms WHERE pokemon_id IN (SELECT id FROM pokemon WHERE species_id=%s)) AND local_language_id=9" % species_ids[i][0])
        species_forms = c.fetchall()

        form_index = []
        form_index += [species_name]
        for j in range(len(species_forms)):
            form_name = "STANDARD" if species_forms[j][1] == None else str(unidecode(species_forms[j][1])).replace("-","_").replace(" ","_").replace(".","").replace("'","").upper()
            form_name = form_name.replace("_FORME","").replace("_FORM","").replace("_TYPE","").replace("_ROTOM","").replace("???","QUESTION_MARK").replace("!","EXCLAMATION_MARK")
            form_name = form_name.replace("?","QUESTION_MARK").replace("_PATTERN","").replace("_KYUREM","").replace("_MODE","")

            if "MEGA" in form_name and "_X" in form_name:
                form_name = "MEGA_X"
            elif "MEGA" in form_name and "_Y" in form_name:
                form_name = "MEGA_Y"
            elif "MEGA" in form_name:
                form_name = "MEGA"

            form_index += [(species_forms[j][0], form_name)]

        forms += [form_index]
    def parse_authors(self):
        # Create authors
        print "Parsing Authors..."
        f = open(data_io.get_paths()["author_processed_path"], "r")
        titles = f.readline()
        for l in f.readlines():
            res = l.strip().split(",")
            # Titles
            raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
            (name, surname) = nlp.filter_title(raw_title)
            try:
                self.surnames[surname] = self.surnames[surname] + 1
            except:
                self.surnames[surname] = 1

            #Affiliations
            raw_affiliation = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
            affiliation = nlp.filter_affiliation(raw_affiliation)
            try:
                self.affiliations[affiliation] = self.affiliations[affiliation] + 1
            except:
                self.affiliations[affiliation] = 1
            self.authors[int(res[0])] = author.Author(int(res[0]), name, surname, affiliation)

        print "Done"
        f.close()
def main():
    URL_SENTIMENT140 = 'http://www.sentiment140.com/api/[email protected]'
    tweets = []
    for line in sys.stdin:
        try:
            tweetData = json.loads(line.decode('utf-8'))
            location = tweetData['user']['location'].strip()
            if location is None or bool(re.search(r'\d',location)):
                location = 'unknown'
            tempDataDict = {'text': unidecode(tweetData['text']), 'location':\
            unidecode(location.upper())}
            tweets.append(tempDataDict)
        except:
            continue
    dataToSend = {'data': tweets}
    try:
        response = urllib2.urlopen(URL_SENTIMENT140, str(dataToSend))
        sentimentJsonResponse = json.loads(response.read())
        parsedDataDict = parseResponse(sentimentJsonResponse)
        for key, value in parsedDataDict.items():
            print "{0}\t{1}".format(key, value)
    except HTTPError as e:
        print 'The server couldn\'t fulfill the request.'
        print 'Error code: ', e.code
    except URLError as e:
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
    except:
        print 'response from server is null or some error has occured'
Example #30
    def get_map_pointers(self):
        pointers = {}
        projects = Project.objects.all()
        collection_points = CollectionPoint.objects.all()
        marker_project = 'markers/project.png'
        marker_collection = 'markers/collection.png'

        if projects:
            for project in projects:
                name = project.name
                name = unidecode(re.sub(r'[^a-zA-Z_]', '', name))

                pointers[name] = {
                    'latitude': project.latitude,
                    'longitude': project.longitude,
                    'url': project.get_absolute_url(),
                    'marker': marker_project,
                }

        if collection_points:
            for point in collection_points:
                name = u'{}{}'.format(point.name, "_c")
                name = unidecode(re.sub(r'[^a-zA-Z_]', '', name))

                pointers[name] = {
                    'latitude': point.latitude,
                    'longitude': point.longitude,
                    'url': reverse('addresses'),
                    'marker': marker_collection,
                }

        return pointers
def cleanText(text):
    "Removes punctuation and replaces spaces with underscores"
    text = unidecode(text)
    text = re.sub("[^A-Za-z0-9 _]", "", text)
    text = text.replace(" ", "_").lower()
    return text
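Illustrative calls to cleanText above; the expected outputs shown in the comments follow from the accent stripping, punctuation removal and space-to-underscore substitution:

# Illustrative calls to cleanText above.
print(cleanText("Café del Mar!"))   # -> cafe_del_mar
print(cleanText("São Paulo 2024"))  # -> sao_paulo_2024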
 def save(self, *args, **kwargs):
     if not self.id or not self.slug:
         self.slug = slugify(unidecode(self.title))
     super().save(*args, **kwargs)
Example #33
def generate_json(input_file, output_path):
    if not is_valid_txt_file(input_file):
        print(f'Skipping input file ({input_file}) as it is not a txt file')
        return

    global pk

    if 'american_football' in input_file:
        sport_name = 'American Football'
    elif 'basketball' in input_file:
        sport_name = 'Basketball'
    elif 'table_tennis' in input_file:
        sport_name = 'Table Tennis'
    elif 'tennis' in input_file:
        sport_name = 'Tennis'
    elif 'football' in input_file:
        sport_name = 'Football'
    elif 'golf' in input_file:
        sport_name = 'Golf'
    elif 'cricket' in input_file:
        sport_name = 'Cricket'
    elif 'boxing' in input_file:
        sport_name = 'Boxing'
    elif 'rugby' in input_file:
        sport_name = 'Rugby'
    elif 'motorsports' in input_file:
        sport_name = 'Motorsports'
    elif 'combatsport' in input_file:
        sport_name = 'Combat Sports'
    elif 'baseball' in input_file:
        sport_name = 'Baseball'
    elif 'hockey' in input_file:
        sport_name = 'Hockey'
    else:
        print(f'Skipping input file ({input_file}) as the file name does not provide a match with the known list of sports '
              f'which would break the relation field in the category model to a sport row in the db as it requires a '
              f'valid natural key')
        return

    try:
        f = open(input_file, encoding="utf-8")
    except FileNotFoundError as err:
        print(str(err))
        return  # without a readable input file there is nothing to convert

    data = []

    date = '2019-09-01T00:00:00.000Z'

    line = f.readline()
    while line:
        unaccented_string = unidecode.unidecode(line[:-1]) if line.endswith('\n') else unidecode.unidecode(line)
        term = {
            'model': 'dictionary.category',
            'pk': pk,
            'fields': {
                'name': unaccented_string,
                'sport': (sport_name,)
            }
        }
        data.append(term)
        pk = pk + 1
        line = f.readline()

    f.close()

    json_file = os.path.splitext(input_file)[-2] + '.json'
    json_file_path = output_path + json_file
    with io.open(json_file_path, 'w', encoding='utf8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
def format_str(str):
	return unidecode(str.replace('"', ''))
Example #35
# -*- coding: utf-8 -*-
import unicodedata
import string

from unidecode import unidecode


def remove_accents(data):
    return ''.join(x for x in unicodedata.normalize('NFKD', data)
                   if x in string.ascii_letters or x == '_').lower()


def remove_accents2(data):
    return filter(lambda char: char in string.ascii_uppercase, data.upper())


s = 'Números_distântes'.encode('utf8')
print(s)
print(unidecode(s.decode('utf-8')))
Example #36
 def _serialize_lote_rps(self, dados_lote_rps, dados_servico):
     dados_tomador = self._prepare_dados_tomador()
     return tpRPS(
         Assinatura=self.assinatura_rps(dados_lote_rps, dados_servico,
                                        dados_tomador),
         ChaveRPS=tpChaveRPS(
             InscricaoPrestador=self.convert_type_nfselib(
                 tpChaveRPS, 'InscricaoPrestador',
                 dados_lote_rps['inscricao_municipal'].zfill(8)),
             SerieRPS=self.convert_type_nfselib(
                 tpChaveRPS, 'SerieRPS', dados_lote_rps['serie']),
             NumeroRPS=self.convert_type_nfselib(
                 tpChaveRPS, 'NumeroRPS', dados_lote_rps['numero']),
         ),
         TipoRPS=self._map_type_rps(dados_lote_rps['tipo']),
         DataEmissao=self.convert_type_nfselib(
             tpRPS, 'DataEmissao',
             dados_lote_rps['data_emissao'].split('T', 1)[0]),
         StatusRPS=self.convert_type_nfselib(tpRPS, 'StatusRPS', 'N'),
         TributacaoRPS=self.convert_type_nfselib(
             tpRPS, 'TributacaoRPS',
             self._map_taxation_rps(dados_lote_rps['natureza_operacao'])),
         ValorServicos=self.convert_type_nfselib(
             tpRPS, 'ValorServicos', dados_servico['valor_servicos']),
         ValorDeducoes=self.convert_type_nfselib(
             tpRPS, 'ValorDeducoes', dados_servico['valor_deducoes']),
         ValorPIS=self.convert_type_nfselib(
             tpRPS, 'ValorPIS', dados_servico['valor_pis']),
         ValorCOFINS=self.convert_type_nfselib(
             tpRPS, 'ValorCOFINS', dados_servico['valor_cofins']),
         ValorINSS=self.convert_type_nfselib(
             tpRPS, 'ValorINSS', dados_servico['valor_inss']),
         ValorIR=self.convert_type_nfselib(
             tpRPS, 'ValorIR', dados_servico['valor_ir']),
         ValorCSLL=self.convert_type_nfselib(
             tpRPS, 'ValorCSLL', dados_servico['valor_csll']),
         CodigoServico=self.convert_type_nfselib(
             tpRPS, 'CodigoServico',
             dados_servico['codigo_tributacao_municipio']),
         AliquotaServicos=self.convert_type_nfselib(
             tpRPS, 'AliquotaServicos', dados_servico['aliquota']),
         ISSRetido='true' if dados_servico['iss_retido'] == '1' else 'false',
         # FIXME: Hardcoded
         CPFCNPJTomador=self.convert_type_nfselib(
             tpRPS, 'CPFCNPJTomador', tpCPFCNPJ(
                 CNPJ=dados_tomador['cnpj'], CPF=dados_tomador['cpf'])),
         InscricaoMunicipalTomador=self.convert_type_nfselib(
             tpRPS, 'InscricaoMunicipalTomador',
             dados_tomador['inscricao_municipal']),
         InscricaoEstadualTomador=self.convert_type_nfselib(
             tpRPS, 'InscricaoEstadualTomador',
             dados_tomador['inscricao_estadual']),
         RazaoSocialTomador=self.convert_type_nfselib(
             tpRPS, 'RazaoSocialTomador', dados_tomador['razao_social']),
         EnderecoTomador=tpEndereco(
             Logradouro=self.convert_type_nfselib(
                 tpEndereco, 'Logradouro', dados_tomador['endereco']),
             NumeroEndereco=self.convert_type_nfselib(
                 tpEndereco, 'NumeroEndereco', dados_tomador['numero']),
             ComplementoEndereco=self.convert_type_nfselib(
                 tpEndereco, 'ComplementoEndereco',
                 dados_tomador['complemento']),
             Bairro=self.convert_type_nfselib(
                 tpEndereco, 'Bairro', dados_tomador['bairro']),
             Cidade=self.convert_type_nfselib(
                 tpEndereco, 'Cidade', dados_tomador['codigo_municipio']),
             UF=self.convert_type_nfselib(
                 tpEndereco, 'UF', dados_tomador['uf']),
             CEP=self.convert_type_nfselib(
                 tpEndereco, 'CEP', dados_tomador['cep']),
         ),
         EmailTomador=self.convert_type_nfselib(
             tpRPS, 'EmailTomador', dados_tomador['email']),
         Discriminacao=self.convert_type_nfselib(
             tpRPS, 'Discriminacao', unidecode(
                 dados_servico['discriminacao'] + (
                     '|%s|' % self.fiscal_additional_data.replace(
                         '\n', '|'
                     ) if self.fiscal_additional_data else ''))
         ),
         ValorCargaTributaria=self.convert_type_nfselib(
             tpRPS, 'ValorCargaTributaria',
             dados_lote_rps['carga_tributaria']),
         FonteCargaTributaria=self.convert_type_nfselib(
             tpRPS, 'FonteCargaTributaria',
             dados_lote_rps['total_recebido']),
         MunicipioPrestacao=self.convert_type_nfselib(
             CabecalhoType, 'Versao',
             self._map_provision_municipality(
                 dados_lote_rps['natureza_operacao'],
                 dados_servico['codigo_municipio']
             )
         ),
     )
Example #37
def remove_non_ascii(text):
    return unidecode(text)
                name_to_find = node_in_file.split(',')[1].split('\n')[0].replace('_', ' ').title()
                id_of_searched = int(node_in_file.split(',')[0])
                print(name_to_find)
                print(id_of_searched)
                print(list_of_nodes_to_add)
                print(list_of_edges_to_add)

                # skip on passed ids
                if id_of_searched < position:
                    continue

                co_authors = AW.all_co_authors(name_to_find)
                # run on every publications and adding it to "nodes_by_lines"
                print(co_authors)
                for co_author in co_authors:
                    author = unidecode(co_author.lower())
                    author = ''.join([i for i in author if (i.isalpha()) | (i == ' ') | (i == '-') | (i == '.') | (i == "'") | (i == "(") | (i == ")")])
                    author_splitted = author.split(' ')
                    # normalizing names
                    normalized_author = ""
                    if Got_Here:
                        print("Got Here0")
                    for i in range(len(author_splitted)):
                        if (not author_splitted[i].isnumeric()):
                            normalized_author += author_splitted[i] + '_'
                    normalized_author = normalized_author[:len(normalized_author)-1]
                    normalized_splitted = normalized_author.split('_')
                    
                    if Got_Here:
                        print("Got Here1")
                    # find first and last name which not contain '.' in them
Example #39
def main():
    text = easygui.codebox("Enter the box-level inventory below.",
                           "Enter Data")
    if not text or text.strip() == "":
        return
    # Remove potential whitespace at beginning and end
    text = unidecode.unidecode(text.strip())

    # First get settings header if it exists
    json_str = re.findall("^{.*}", text)
    json_str = json_str[0] if json_str else None
    opts = options.get_options(json_str)
    text = re.sub("^{.*}", "", text).strip()

    lines = text.split('\n')
    # Get first box number, first by trying to find it, then by inputbox if not
    first_num = re.findall(r'^[\s\n]*\d{1,3}', text)
    first_num = int(first_num[0].strip()) if first_num else easygui.integerbox(
        "Enter the box number.",
        "Enter Box Number",
        lowerbound=1,
        upperbound=999)
    first_num = first_num if first_num else 1

    entries = []
    last_entry = None
    extras = 0
    for l in lines:
        l = re.sub(r'\s+', " ", l).strip()
        # Perform user substitutions per line
        if opts["user_regex"] is not None and opts["user_subst"] is not None:
            l = opts["user_regex"].sub(opts["user_subst"], l)

        if re.match(r'^\d+(?:\-\d+)?\s', l) and last_entry:
            entries.append(last_entry)
            for _ in range(extras):
                entries.append(last_entry)
            last_entry = None
            extras = 0

        if re.match(r'^\d+(?:\-\d+)?\s', l):
            while re.match(r'^\d+(?:\-\d+)?\s', l):
                # If we have a range, determine the range
                rng = re.findall(r'^\d+\-\d+\s', l)
                if rng:
                    rng = [int(x) for x in rng[0].strip().split('-')]
                    extras = rng[1] - rng[0]
                l = re.sub(r'^\d+(?:\-\d+)?\s', "", l).strip()
            last_entry = l
        else:
            last_entry += "\n" + l
    if last_entry:
        entries.append(last_entry)
        for _ in range(extras):
            entries.append(last_entry)

    contents = ""
    for i, e in enumerate(entries, start=first_num):
        e = e.strip()  # Remove unnecessary leading/trailing space
        contents += "box\t"
        contents += "Box " + str(i)
        # Get all years from current entry. The replace business will change a
        # year like '66 to 1966 or '01 to 2001 (19/20 dependent on current year)
        years = [
            y.replace("'", "19" if int(y[1:]) > datetime.now().year %
                      100 else "20") if len(y) < 4 else y
            for y in re.findall(r"(?:[1-2]\d{3}|'\d\d)", e)
        ]
        years = [
            int(y) for y in years
            if opts["min_year"] <= int(y) <= datetime.now().year
        ]
        if years:
            if len(years) >= 2:
                # Pick the min and max (for out of order years or many years)
                years = [min(years), max(years)]
                # If they're the same, just make the second one blank
                if years[0] == years[1]:
                    years[1] = ""
            if len(years) == 1:
                # Add a blank entry so we have a tab still
                years.append("")
            contents += "\t" + "\t".join(str(y) for y in years) + "\t"
        else:
            contents += "\t\t\tn.d."
        contents += "\t" + str(i) + "\tbox\t\t"
        contents += '"' + e.replace('"', '""') + '"'
        contents += "\n"

    pyperclip.copy(contents[:-1])
    easygui.msgbox("Copied to clipboard.", "Done")
Example #40
 def _is_field_common_name(self, field: str) -> bool:
     field = unidecode(field).upper()
     return field in self.FIELD_NAMES
def slugify(title):
    return re.sub(r'\W+', '-',
                  str(unidecode.unidecode(title)).lower()).strip(' -')
Example #42
def slugify(str, separator='_'):
    str = unidecode.unidecode(str).lower().strip()
    return re.sub(r'\W+', separator, str).strip(separator)
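Illustrative calls to the slugify variant above, which lets the caller choose the separator:

# Illustrative calls to slugify above.
print(slugify("Héllo, Wörld!"))       # -> 'hello_world'
print(slugify("Héllo, Wörld!", '-'))  # -> 'hello-world'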
Example #43
        def lhandler(default, toconf, show_header=True):
            if show_header:
                print(
                    "We will now ask you to provide the list of languages you want to use."
                )
                print(
                    "Please list all the desired languages, comma-separated, using ISO 639-1 codes.  The first language will be used as the default."
                )
                print(
                    "Type '?' (a question mark, sans quotes) to list available languages."
                )
            answer = ask('Language(s) to use', 'en')
            while answer.strip() == '?':
                print('\n# Available languages:')
                try:
                    print(SAMPLE_CONF['_SUPPORTED_LANGUAGES'] + '\n')
                except UnicodeEncodeError:
                    # avoid Unicode characters in supported language names
                    print(
                        unidecode.unidecode(
                            SAMPLE_CONF['_SUPPORTED_LANGUAGES']) + '\n')
                answer = ask('Language(s) to use', 'en')

            langs = [
                i.strip().lower().replace('-', '_') for i in answer.split(',')
            ]
            for partial, full in LEGAL_VALUES[
                    '_TRANSLATIONS_WITH_COUNTRY_SPECIFIERS'].items():
                if partial in langs:
                    langs[langs.index(partial)] = full
                    print("NOTICE: Assuming '{0}' instead of '{1}'.".format(
                        full, partial))

            default = langs.pop(0)
            SAMPLE_CONF['DEFAULT_LANG'] = default
            # format_default_translations_config() is intelligent enough to
            # return the current value if there are no additional languages.
            SAMPLE_CONF['TRANSLATIONS'] = format_default_translations_config(
                langs)

            # Get messages for navigation_links.  In order to do this, we need
            # to generate a throwaway TRANSLATIONS dict.
            tr = {default: ''}
            for l in langs:
                tr[l] = './' + l
            # Assuming that base contains all the locales, and that base does
            # not inherit from anywhere.
            try:
                messages = load_messages(['base'],
                                         tr,
                                         default,
                                         themes_dirs=['themes'])
                SAMPLE_CONF['NAVIGATION_LINKS'] = format_navigation_links(
                    langs, default, messages, SAMPLE_CONF['STRIP_INDEXES'])
            except nikola.utils.LanguageNotFoundError as e:
                print("    ERROR: the language '{0}' is not supported.".format(
                    e.lang))
                print(
                    "    Are you sure you spelled the name correctly?  Names are case-sensitive and need to be reproduced as-is (complete with the country specifier, if any)."
                )
                print(
                    "\nType '?' (a question mark, sans quotes) to list available languages."
                )
                lhandler(default, toconf, show_header=False)
def remove_nonunicode(tweet):
    return bytes(unidecode.unidecode(tweet), 'utf-8').decode('utf-8', 'ignore')
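A quick illustrative call to remove_nonunicode above; since unidecode already returns ASCII-only text, the extra bytes round-trip acts as a safety net rather than a transformation:

# Illustrative call to remove_nonunicode above.
print(remove_nonunicode("naïve café tweet"))  # -> 'naive cafe tweet'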
Example #45
# coding: utf-8

# In[3]:

from unidecode import unidecode
import pandas as pd

scores = pd.read_csv("Score.csv")

# remove all the accents
for i in range(len(scores)):
    scores.LastName[i] = unidecode(unicode(scores.LastName[i]))
    scores.FirstName[i] = unidecode(unicode(scores.FirstName[i]))

scores.to_csv("Score_no_accent.csv")

Example #46
def read_metadata(xml_file, module_id, package_type, year_str, month_str,
                  month_str_ar):
    metadata = {}
    mun_name = Municipality.query.filter_by(
        municipal_id=current_user.municipal_id).first()
    tree = et.parse(xml_file)
    soup = tree.getroot()
    for item in soup.findall('module'):
        if int(item.get('id')) == module_id:
            for pack in item.find('packages').findall('package'):
                if package_type == pack.get('id'):
                    metadata = {
                        "name":
                        pack.find('name').text.replace(
                            'mun_name',
                            unidecode.unidecode(
                                mun_name.municipal_name)).lower().replace(
                                    ' ', '-'),
                        "title":
                        pack.find('title').text.replace(
                            'mun_name', mun_name.municipal_name),
                        "title_ar":
                        pack.find('title_ar').text.replace(
                            'mun_name_ar', mun_name.municipal_name_ar),
                        "notes":
                        pack.find('notes').text.replace(
                            'mun_name', mun_name.municipal_name),
                        "notes_ar":
                        pack.find('notes_ar').text.replace(
                            'mun_name_ar', mun_name.municipal_name_ar),
                        "frequency_update":
                        pack.find('frequency_update').text,
                        "keywords": {
                            "ar": [
                                _.replace('mun_name_ar', '') for _ in
                                pack.find('keywords_ar').text.split(',')
                            ],
                            "fr": [
                                _.replace('mun_name', mun_name.municipal_name)
                                for _ in pack.find('keywords_fr').text.split(
                                    ',')
                            ]
                        },
                        "author":
                        current_user.name + ' ' + current_user.last_name,
                        "author_email":
                        current_user.email,
                        "maintainer":
                        current_user.name + ' ' + current_user.last_name,
                        "maintainer_email":
                        current_user.email,
                        "owner_org":
                        mun_name.ckan_id,
                        "private":
                        False,
                        "license_id":
                        'cc-by',
                        "groups": [{
                            'name': pack.find('groups').text
                        }],
                        "resources": []
                    }
                    for res in pack.find('resources').findall('resource'):
                        metadata['resources'].append({
                            'description':
                            res.find('description').text.replace(
                                'mun_name', mun_name.municipal_name).replace(
                                    'YYYY',
                                    year_str).replace('MMMM', month_str),
                            'description_ar':
                            res.find('description_ar').text.replace(
                                'mun_name_ar',
                                mun_name.municipal_name_ar).replace(
                                    'YYYY',
                                    year_str).replace('MMMM_ar', month_str_ar),
                            'name':
                            res.find('name').text.replace(
                                'mun_name', mun_name.municipal_name).replace(
                                    'YYYY',
                                    year_str).replace('MMMM', month_str),
                            'name_ar':
                            res.find('name_ar').text.replace(
                                'mun_name_ar',
                                mun_name.municipal_name_ar).replace(
                                    'YYYY',
                                    year_str).replace('MMMM_ar', month_str_ar),
                            'format':
                            res.find('format').text,
                            'type':
                            res.get('id')
                        })
                    break
            break
    return metadata
Example #47
def convert_name(name):
    ascii_name = unidecode(name.replace(" ", "_")).lower().replace(" ", "-")
    return re.sub('-_', '_', ascii_name)
Example #48
 def __call__(self, instance, filename):
     filename, ext = filename.rsplit('.', 1)
     filename = re.sub(r'[_.,:;@#$%^&?*|()\[\]]', '-', filename)
     filename = slugify(unidecode(smart_text(filename)))
     full_filename = '.'.join([filename, ext])
     return os.path.join(self.sub_path, full_filename)
Example #49
def urlify(s):
    s = re.sub(r"[^\w\s\-]", '', s)
    s = re.sub(r"\s+", '-', s).lower()
    return unidecode(s)
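An illustrative call to urlify above, assuming Python 3 regex semantics where \w matches accented letters, so the accents survive until the final unidecode call:

# Illustrative call to urlify above (Python 3 regex semantics assumed).
print(urlify("Crème Brûlée!"))  # -> 'creme-brulee'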
Example #50
def task_submit_post(request, task_id, submit_type):
    """Spracovanie uploadnuteho submitu"""
    try:
        submit_type = int(submit_type)
    except ValueError:
        raise HttpResponseBadRequest

    # Raise Not Found when submitting non existent task
    task = get_object_or_404(Task, pk=task_id)

    # Raise Not Found when submitting non-submittable submit type
    if not task.has_submit_type(submit_type):
        raise Http404

    # Raise Not Found when not submitting through POST
    if request.method != "POST":
        raise Http404

    try:
        sfile = request.FILES["submit_file"]
    except:  # noqa: E722 @FIXME
        # error will be reported from form validation
        pass

    # File will be sent to tester
    if (submit_type == constants.SUBMIT_TYPE_SOURCE
            or submit_type == constants.SUBMIT_TYPE_TESTABLE_ZIP):
        if submit_type == constants.SUBMIT_TYPE_SOURCE:
            form = SourceSubmitForm(request.POST, request.FILES)
        else:
            form = TestableZipSubmitForm(request.POST, request.FILES)
        if form.is_valid():
            if submit_type == constants.SUBMIT_TYPE_SOURCE:
                language = form.cleaned_data["language"]
            else:
                language = ".zip"
            # Source submit's should be processed by process_submit()
            submit_id = process_submit(sfile, task, language, request.user)
            if not submit_id:
                messages.add_message(request, messages.ERROR,
                                     "Nepodporovaný formát súboru")
            else:
                # Source file-name is id.data
                sfiletarget = unidecode(
                    os.path.join(
                        get_path(task, request.user),
                        submit_id + constants.SUBMIT_SOURCE_FILE_EXTENSION,
                    ))
                write_chunks_to_file(sfiletarget, sfile.chunks())
                sub = Submit(
                    task=task,
                    user=request.user,
                    submit_type=submit_type,
                    points=0,
                    filepath=sfiletarget,
                    testing_status=constants.SUBMIT_STATUS_IN_QUEUE,
                    protocol_id=submit_id,
                )
                sub.save()
                if task.email_on_code_submit:
                    send_notification_email(sub, task_id, submit_type)

                success_message = format_html(
                    "Úspešne si submitol program, výsledok testovania nájdeš "
                    '<a href="{}">tu</a>',
                    reverse("view_submit", args=[sub.id]),
                )
                messages.add_message(request, messages.SUCCESS,
                                     success_message)
        else:
            for field in form:
                for error in field.errors:
                    messages.add_message(request, messages.ERROR,
                                         "%s: %s" % (field.label, error))
        if "redirect_to" in request.POST and request.POST["redirect_to"]:
            return redirect(request.POST["redirect_to"])
        else:
            return redirect(
                reverse("task_submit_page", kwargs={"task_id": int(task_id)}))

    # File won't be sent to tester
    elif submit_type == constants.SUBMIT_TYPE_DESCRIPTION:
        if request.user.is_competition_ignored(
                task.round.semester.competition):
            return HttpResponseForbidden()
        form = DescriptionSubmitForm(request.POST, request.FILES)
        if form.is_valid():
            sfiletarget = get_description_file_path(sfile, request.user, task)
            write_chunks_to_file(sfiletarget, sfile.chunks())
            sub = Submit(
                task=task,
                user=request.user,
                submit_type=submit_type,
                points=0,
                testing_status=constants.SUBMIT_STATUS_IN_QUEUE,
                filepath=sfiletarget,
            )
            sub.save()
            if task.email_on_desc_submit:
                send_notification_email(sub, task_id, submit_type)

            if task.round.can_submit:
                messages.add_message(
                    request,
                    messages.SUCCESS,
                    _("You have successfully submitted your description, "
                      "it will be reviewed after the round finishes."),
                )
            else:
                messages.add_message(
                    request,
                    messages.WARNING,
                    _("You have submitted your description after the deadline. "
                      "It is not counted in results."),
                )
        else:
            for field in form:
                for error in field.errors:
                    messages.add_message(request, messages.ERROR,
                                         "%s: %s" % (field.label, error))

        if "redirect_to" in request.POST and request.POST["redirect_to"]:
            return redirect(request.POST["redirect_to"])
        else:
            return redirect(
                reverse("task_submit_page", kwargs={"task_id": int(task_id)}))

    else:
        # Only description, source and zip submits are currently supported
        raise Http404
def safe_filename(accented_string):
    """ make a safe filename with no non-ascii chars """
    return "".join([c for c in unidecode.unidecode(accented_string) \
        if c.isascii() or c.isdigit() or c == ' ']).rstrip()
Beispiel #52
0
 def get_valid_name(self, name):
     print('It is called')
     return unidecode(name)
Beispiel #53
0
    # for ff in ["label", "wikilinks"]:
    for ff in ["label"] + list(kFEATURES.keys()):
        print("Loading %s" % ff)
        feat = instantiate_feature(ff, qdb)
        if ff == "label":
            meta = open("features/expo/%s.meta" % flags.granularity, 'w')
        else:
            meta = None

        # Open the feature file for output
        filename = ("features/%s/%s.%s.feat" % ('expo', flags.granularity, ff))
        print("Opening %s for output" % filename)
        o = open(filename, 'w')

        for page in questions:
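            # feature_lines() yields one line per (sentence, token) position in the
            # question; for the "label" feature the positions also go to the .meta file.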
            for qq in questions[page]:
                for ss, tt, pp, line in feature_lines(qq, guess_list,
                                                      flags.granularity, feat):
                    assert ff is not None
                    o.write("%s\n" % line)

                    if meta is not None:
                        meta.write("%i\t%i\t%i\t%s\n" %
                                   (qq.qnum, ss, tt, unidecode(pp)))

                o.flush()
        o.close()
        print("Done with %s" % ff)
        # now that we're done with it, delete the feature
        del feat
Beispiel #54
0
    def post(self, request):
        args = {}
        if request.method == 'POST':
            edit_control = self.request.POST.get('control')
            first_title = self.request.POST.get('first_title')
            form = PostForm(request.POST)
            print(form.is_valid())

            if form.is_valid():
                post = form.save(commit=False)

                if edit_control == "edit":
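                    # "edit": load the existing post by its original title, overwrite its
                    # fields, and regenerate the slug from the new title.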
                    post = Post.objects.get(author=request.user,
                                            title=first_title)
                    post.title = form.cleaned_data['title']
                    post.category = form.cleaned_data['category']
                    post.body = form.cleaned_data['body']
                    post.allow_comments = form.cleaned_data['allow_comments']
                    text = unidecode.unidecode(post.title).lower()
                    post.slug = re.sub(r'[\W_]+', '-', text)
                    post.save(update_fields=[
                        'title', 'category', 'body', 'allow_comments', 'slug'
                    ])
                    PostListView.as_view()(self.request)
                    return HttpResponseRedirect(post.get_absolute_url())

                elif edit_control == "delete":
                    Post.objects.get(author=request.user,
                                     title=first_title).delete()
                    PostListView.as_view()(self.request)
                    return render(request, self.template_name, {
                        'delete_info': True,
                        'first_title': first_title
                    })

                # add-post section
                try:
                    exist_blog = Post.objects.get(
                        title=form.cleaned_data['title'])
                    return render(request, self.template_name, {
                        'exist_blog': exist_blog,
                        'post_form': form
                    })

                except Post.DoesNotExist:

                    post.author = request.user
                    post.publish = timezone.now()
                    post.title = form.cleaned_data['title']
                    post.category = form.cleaned_data['category']
                    post.body = form.cleaned_data['body']
                    post.allow_comments = form.cleaned_data['allow_comments']
                    text = unidecode.unidecode(post.title).lower()
                    post.slug = re.sub(r'[\W_]+', '-', text)
                    form.save()
                    PostListView.as_view()(self.request)
                    return HttpResponseRedirect(post.get_absolute_url())

                #

            else:
                print(form.errors)
        else:
            pass
        return PostListView.as_view()(self.request)
def generate_misspell_sample(query, n=25, max_edit_distance=4):
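    """Generate up to n misspelled variants of `query`.

    Each variant applies at most `max_edit_distance` random edit actions
    (character insertion, removal, replacement, Telex-style replacement or
    diacritic removal); n is overridden based on the token count of the query.
    """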
    tokens = query.split()
    if len(tokens) == 1:
        n = 15
    elif len(tokens) == 2:
        n = 25
    elif len(tokens) == 3:
        n = 35
    else:
        n = 45
    clenq = len(query)
    vv = clenq // 4  # integer division, so short queries still get at least one edit below
    if vv == 0:
        vv = 1
    max_edit_distance = int(min(vv, max_edit_distance))

    results = set()

    for _ in range(n):
        actions = [random.choice(ACTIONS) for i in range(max_edit_distance)]
        qx = query

        for a in actions:
            clen = len(qx)
            if clen == 0:
                break
            pos = random.randint(0, clen - 1)

            if a == "DO_NOTHING":
                continue

            if a == "INSERT":
                rc = random.sample(CHARS, 1)[0]
                qx = qx[:pos + 1] + rc + qx[pos + 1:]
            elif a == "REMOVE":
                qx = qx[:pos] + qx[pos + 1:]
            elif a == "NORMAL_REPLACE":
                c = qx[pos]
                if c not in SAMPLING_MAP:
                    qx = qx[:pos] + random.choice(CHARS) + qx[pos + 1:]
                else:
                    qx = qx[:pos] + \
                        random.choice(SAMPLING_MAP[c]) + qx[pos + 1:]
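            # TELEX_REPLACE simulates Vietnamese Telex typing: the accented character is
            # replaced by its VN_TELEX pair (base letter + accent key), with the accent
            # key shifted behind the following character when there is one.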
            elif a == "TELEX_REPLACE":
                c = qx[pos]
                if c not in VN_TELEX:
                    if c not in SAMPLING_MAP:
                        qx = qx[:pos] + random.choice(CHARS) + qx[pos + 1:]
                    else:
                        qx = qx[:pos] + \
                            random.choice(SAMPLING_MAP[c]) + qx[pos + 1:]
                else:
                    if pos < len(qx) - 1:
                        nc = qx[pos + 1]
                        telex_c = random.choice(VN_TELEX[c])
                        if nc != u" ":
                            qx = qx[:pos] + telex_c[0] + \
                                nc + telex_c[1] + qx[pos + 2:]
                        else:
                            qx = qx[:pos] + telex_c[0] + \
                                telex_c[1] + qx[pos + 1:]
            elif a == "UNACCENT":
                tokens = qx.split()
                tc = random.randint(0, len(tokens) - 1)
                tokens[tc] = unidecode(tokens[tc])
                qx = u" ".join(tokens)

        qx = u" ".join(qx.split())
        if qx != query:
            results.add(qx)

    return results
# Get latest IDs because for some reason Papers and Works aren't autoincrementing
paper_id = Papers.objects.latest("paperid").paperid + 1
author_id = Authors.objects.latest("authorid").authorid + 1


for article in articles:
    title = article.find(class_="articleTitle").get_text().strip()
    # Conference proceeding entries look like articles but don't have an articleauthors field
    try:
        authors = article.find(class_="articleAuthors").find_all("a")
    except AttributeError:
        continue
    authors_sql = []
    for author in authors:
        # Author names are sometimes accented; normalize them to ASCII with unidecode
        author_string = unidecode(author.string)
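        # Reuse the existing Authors row for this name if it is already in the
        # database; otherwise create one with a manually assigned id.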
        try:
            author_sql = Authors.objects.get(authorname=author_string)
        except Authors.DoesNotExist:
            author_sql = Authors.objects.create(authorname=author_string, authorid=author_id)
            author_id += 1
            author_sql.save()
        authors_sql.append(author_sql)
    doi = article.find(class_="articleCitation").get_text().strip().split()[0]
    paper = Papers.objects.create(paperid=paper_id, title=title, doi=doi, numauthors=len(authors_sql))
    paper_id += 1
    paper.save()
    for author in authors_sql:
        work = Works.objects.create(authorid=author.authorid, paperid=paper.paperid)
        work.save()
Beispiel #57
0
 def remove_tone(self, s):
     return unidecode.unidecode(s)
Beispiel #58
0
def get_missing_deets(url):
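    """Scrape a missing-person detail page and return its fields as a DataFrame.

    Each table body describing a missing child or an abductor is parsed into
    label/value pairs; the result has one row per person plus an image URL.
    """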
    print(url)
    r = requests.get(url)
    page = BeautifulSoup(r.content, 'html.parser')

    missing_dict = {}
    for body in page.findAll('tbody'):
        if body.find('span').find('p') is not None:
            if 'Missing' in body.find('span').find('p').text:
                ## prep to extract
                num = len(body.findAll('p'))
                if num > 1:
                    txt = body.findAll('p')[1].text
                    name = body.find('span').find('p').text.split(
                        'Missing Child: ')[1]
                else:
                    txt = body.text
                    name = page.find('title').text
                txt = txt.replace('\n', '')
                for i in range(10):
                    txt = txt.replace('  ', ' ').strip(' ')

                for col in COLS:
                    txt = txt.replace(col, ' :{}'.format(col))
                txt = txt.strip(' ')

                ## get info about child
                tmp_dict = {'abductor': False}
                keys = txt.split(':')[1:][::2]
                values = [
                    unidecode(x).strip(' ') for x in txt.split(':')[1:][1::2]
                ]
                for i in range(len(keys)):
                    tmp_dict.update(
                        {keys[i].replace(' ', '_').lower(): values[i]})

                ## update overall dict
                missing_dict.update({name: tmp_dict})
        elif 'Abductor' in body.text:
            ## clean
            txt = unidecode(body.text.replace('\n', ''))
            for i in range(10):
                txt = txt.replace('  ', ' ').strip(' ')
            for col in COLS:
                txt = txt.replace(col, ' :{}'.format(col))
            txt = txt.strip(' ')

            ## get info about abductor
            tmp_dict = {'abductor': True}
            keys = txt.split(':')[1:][::2]
            values = [
                unidecode(x).strip(' ') for x in txt.split(':')[1:][1::2]
            ]
            key_len = np.array([len(keys), len(values)]).min()
            for i in range(key_len):  # iterate over the shorter of keys/values
                if keys[i] == 'Abductor':
                    name = values[i]
                else:
                    tmp_dict.update(
                        {keys[i].replace(' ', '_').lower(): values[i]})
            missing_dict.update({name: tmp_dict})

    df = pd.DataFrame.from_dict(missing_dict).transpose().reset_index().rename(
        columns={'index': 'name'})
    images = [
        x.get('src') for x in page.findAll('img', src=True)
        if 'missing/kids/' in x.get('src')
    ]
    df['image'] = images[:len(df)]
    return df
Beispiel #59
0
        open(flags.buzz, "w"),
        ["question", "sentence", "word", "page", "evidence", "final", "weight"],
    )
    o_buzz.writeheader()

    o_final = DictWriter(open(flags.final, "w"), ["question", "answer"])
    o_final.writeheader()

    for question in results:
        pos, guess = results[question]
        ss, tt = word_position_to_sent(questions, question, pos)

        for sent_offset, sent in enumerate(questions[question]):
            question_line = {}
            question_line["id"] = question
            question_line["answer"] = unidecode(answers[question])
            question_line["sent"] = sent_offset
            question_line["text"] = unidecode(sent)
            o_questions.writerow(question_line)

        buzz_line = {}
        buzz_line["question"] = question
        buzz_line["sentence"] = ss
        buzz_line["word"] = tt
        buzz_line["page"] = guess
        buzz_line["final"] = 1
        buzz_line["weight"] = 1.0
        o_buzz.writerow(buzz_line)

        final_line = {}
        final_line["question"] = question
Beispiel #60
0
def slugify(text, delim=u'-'):
    """Generates an ASCII-only slug."""
    result = []
    for word in _punct_re.split(text.lower()):
        result.extend(unidecode(word).split())
    return unicode(delim.join(result))
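# Illustrative usage (assuming _punct_re matches runs of punctuation/whitespace):
#   slugify(u"Über uns!")  ->  u"uber-uns"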