def parse(self, response):
    """Turn one API response into a flat article record.

    The response is validated/decoded by ``utils.simple_check``; the
    resulting JSON must expose ``content.id``, ``content.sectionId`` and
    ``content.fields.{headline, body, wordcount}``.

    Yields:
        A single dict with the keys ``gid``, ``section``, ``headline``,
        ``text`` and ``wordcount``. Headline and text are passed through
        ``utils.remove_html``.

    Raises:
        scrapy.exceptions.CloseSpider: if ``utils.simple_check`` raises.
    """
    try:
        json_response = utils.simple_check(response)
    except Exception as e:
        # Exceptions have no ``.message`` attribute on Python 3, so the old
        # handler crashed with AttributeError instead of closing the spider.
        # Formatting the exception object itself works on every version.
        logging.error("request_failed: %s", e)
        raise scrapy.exceptions.CloseSpider("request_failed: %s" % e)

    content = json_response["content"]
    fields = content["fields"]

    result = {}
    result["gid"] = content["id"]
    result["section"] = content["sectionId"]
    result["headline"] = utils.remove_html(fields["headline"])
    result["text"] = utils.remove_html(fields["body"])
    result["wordcount"] = fields["wordcount"]
    yield result
def parse(self, response):
    """Turn one API response into a flat article record.

    The response is validated/decoded by ``utils.simple_check``; the
    resulting JSON must expose ``content.id``, ``content.sectionId`` and
    ``content.fields.{headline, body, wordcount}``.

    Yields:
        A single dict with the keys ``gid``, ``section``, ``headline``,
        ``text`` and ``wordcount``. Headline and text are passed through
        ``utils.remove_html``.

    Raises:
        scrapy.exceptions.CloseSpider: if ``utils.simple_check`` raises.
    """
    try:
        json_response = utils.simple_check(response)
    except Exception as e:
        # Exceptions have no ``.message`` attribute on Python 3, so the old
        # handler crashed with AttributeError instead of closing the spider.
        # Formatting the exception object itself works on every version.
        logging.error("request_failed: %s", e)
        raise scrapy.exceptions.CloseSpider("request_failed: %s" % e)

    content = json_response["content"]
    fields = content["fields"]

    result = {}
    result["gid"] = content["id"]
    result["section"] = content["sectionId"]
    result["headline"] = utils.remove_html(fields["headline"])
    result["text"] = utils.remove_html(fields["body"])
    result["wordcount"] = fields["wordcount"]
    yield result
def pipeline(self, text):
    """Run ``text`` through the cleaning helpers and return the result.

    The helpers from ``utils`` are applied in this fixed order:
    ``remove_space``, ``remove_punct``, ``remove_contractions`` (on the
    lower-cased text, with the module-level ``contractions`` table),
    ``remove_url``, ``remove_html`` and ``correct_spellings``.
    """
    without_space = utils.remove_space(text)
    without_punct = utils.remove_punct(without_space)
    # Only the contraction step receives lower-cased input; everything after
    # it therefore operates on whatever that helper returns.
    without_contractions = utils.remove_contractions(
        without_punct.lower(), contractions)
    without_url = utils.remove_url(without_contractions)
    without_html = utils.remove_html(without_url)
    return utils.correct_spellings(without_html)
async def fix_entry(data):
    """Normalise a stored entry in place and return it.

    Returns ``None`` when ``data`` is ``None``. Otherwise:

    * a string ``image`` has ``imag.cf`` rewritten to ``i.matdoes.dev`` and
      is then replaced by the result of ``images.get_data``;
    * a non-string ``image`` without a truthy ``thumbnail_b64`` is replaced
      by ``images.get_data`` of its ``src``;
    * if anything changed, the whole entry is written back with ``$set``;
    * ``content`` is passed through ``utils.fix_html`` and
      ``nohtml_content`` is filled in when absent (neither is persisted
      here, since both happen after the write).
    """
    if data is None:
        return

    # Shallow snapshot, used below to detect whether the image was changed.
    snapshot = dict(data)

    image = data.get('image')
    if image and isinstance(image, str):
        data['image'] = image.replace('imag.cf', 'i.matdoes.dev')
        data['image'] = await images.get_data(data['image'])
    elif image and not data['image'].get('thumbnail_b64'):
        data['image'] = await images.get_data(data['image']['src'])

    if data != snapshot:
        await entries_coll.update_one({'_id': data['_id']}, {'$set': data})

    data['content'] = utils.fix_html(data['content'])
    if 'nohtml_content' not in data:
        data['nohtml_content'] = utils.remove_html(data['content'])
    return data
async def edit_entry(title, content, editor=None, unlisted=False, entry_id=None, image=None):
    """Create or update an entry and append a history record.

    The title is stripped and the content is passed through
    ``utils.fix_html``; an HTML-free copy from ``utils.remove_html`` is
    stored alongside it. When ``entry_id`` is falsy a new UUID4 string is
    generated, and the update is an upsert so the entry is created.

    Args:
        title: Entry title.
        content: Entry HTML.
        editor: Stored as ``author`` on the history record.
        unlisted: Stored on the entry unless it is ``None``; always stored
            on the history record.
        entry_id: Id of the entry to edit, or falsy to create one.
        image: Image source; stored as ``{'src': image}`` when not ``None``.

    Returns:
        The id of the entry that was written.
    """
    edited_at = datetime.now()
    clean_title = title.strip()
    fixed_content = utils.fix_html(content)
    plain_content = utils.remove_html(fixed_content)
    has_image = image is not None

    set_fields = dict(
        title=clean_title,
        content=fixed_content,
        last_edited=edited_at,
        nohtml_content=plain_content,
    )
    if unlisted is not None:
        set_fields['unlisted'] = unlisted
    if has_image:
        set_fields['image'] = {'src': image}

    entry_id = entry_id or str(uuid.uuid4())

    history_entry = dict(
        author=editor,
        content=fixed_content,
        title=clean_title,
        time=edited_at,
        unlisted=unlisted,
    )
    if has_image:
        history_entry['image'] = {'src': image}

    changes = {'$set': set_fields, '$push': {'history': history_entry}}
    await entries_coll.update_one({'_id': entry_id}, changes, upsert=True)
    return entry_id
async def create_response(entry_data, preview=False):
    """Build the JSON-serialisable payload for an entry.

    Args:
        entry_data: Stored entry mapping; must contain ``_id`` and ``title``.
        preview: When true, return the short preview payload instead of the
            full one.

    Returns:
        ``web.HTTPNotFound()`` (returned, not raised) when ``entry_data`` is
        falsy. Otherwise a dict: for previews ``title``, ``preview``,
        ``html``, ``id`` and ``image``; for full responses ``slug``, ``id``,
        ``title``, ``html``, ``unlisted``, ``image``, ``markdown``,
        ``no_html`` and ``owner_id``.
    """
    if not entry_data:
        return web.HTTPNotFound()

    entry_id = entry_data['_id']
    title = entry_data['title']
    image = entry_data.get('image')
    content = await utils.before_show_text(
        entry_data.get('content', '[no content]'))

    if preview:
        return {
            'title': title,
            'preview': utils.remove_html(content),
            'html': content,
            'id': entry_id,
            'image': image,
        }

    # The stored 'markdown' field used to be read and then immediately
    # overwritten; markdown is always regenerated from the rendered HTML.
    # Slug and markdown are only computed here because previews never
    # include them.
    return {
        'slug': utils.url_title(title),
        'id': entry_id,
        'title': title,
        'html': content,
        'unlisted': entry_data.get('unlisted', False),
        'image': image,
        'markdown': utils.html_to_markdown(content),
        'no_html': entry_data.get('nohtml_content'),
        'owner_id': entry_data.get('owner_id'),
    }
def get_recos():
    """Return ranked product recommendations for a free-text description.

    Reads ``user_desc`` and ``n`` from the form body for POST requests and
    from the query string otherwise, embeds the description, looks up the
    ``n`` most similar catalog products and returns them as a JSON list of
    ``{'rank': <1-based position>, 'prod_desc': <description>}`` objects,
    each description passed through ``remove_html``.
    """
    # Flask dispatches HEAD (and any other allowed verb) to the same view.
    # Branching only on GET/POST left both variables unbound for those
    # requests, so everything that is not a POST now reads the query string.
    params = request.form if request.method == 'POST' else request.args
    user_desc = params['user_desc']
    num_recs = int(params['n'])

    user_embedding = compute_user_input_embedding(
        preproc_user_input(user_desc, model), model)
    sim_prod = get_similar_products(user_embedding, catalog_embeddings, num_recs)

    # A real list (not a lazy ``map`` object) so ``serve_recos`` may index
    # or iterate it more than once on Python 3.
    id_list = [tup[0] for tup in sim_prod]
    recos = serve_recos(id_list, catalog)

    results = [
        {'rank': rank, 'prod_desc': remove_html(reco)}
        for rank, reco in enumerate(recos, start=1)
    ]
    return jsonify(results)
def csv_export(self, request, qs=None, raw_headers=None):
    """
    Build a CSV attachment response from this admin's records.

    ``raw_headers`` (or ``self.get_csv_raw_headers(request)`` when it is
    None) lists the columns. An entry may be:

    * a key of ``self.get_csv_header_names(request)``;
    * a callable exposing ``name`` and ``short_description``, called with
      the record (and ``plaintext=True`` when it has a ``plaintext``
      attribute);
    * a 2-item ``(attribute_name, verbose_label)`` tuple or list;
    * the name of an attribute of this admin, called with the record;
    * the name of an attribute of the record, or a "__"-delimited path
      resolved through ``get_attr``.

    For dict records, an entry that is a key of the record is copied
    verbatim and skips all other processing.

    At most ``self.csv_record_limit`` records of
    ``self.get_csv_queryset(request, qs)`` are written. The header row is
    emitted together with the first record, so an empty queryset produces an
    empty body.

    Raises ``Exception('No headers specified.')`` when no headers are given
    and they cannot be taken from the first record.
    """
    # ``mimetype`` was removed from HttpResponse in Django 1.7, while
    # ``content_type`` is accepted by old and new releases alike.
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename=%s.csv' \
        % slugify(self.model.__name__)
    if raw_headers is None:
        raw_headers = self.get_csv_raw_headers(request)

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so the header/value dispatch below does not die with a NameError.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    def get_attr(obj, name, as_name=False):
        """
        Dereferences "__" delimited variable names.
        """
        parts = name.split('__')
        cursor = obj
        for part in parts:
            name = part
            cursor = getattr(cursor, part, None)
            if callable(cursor):
                cursor = cursor()
        if cursor == obj:
            return
        if as_name:
            return name
        return cursor

    header_data = {}
    fieldnames = []
    header_names = self.get_csv_header_names(request)

    first = True
    qs = self.get_csv_queryset(request, qs)
    for r in qs[:self.csv_record_limit]:
        if first:
            # The header is resolved lazily because, for dict rows, the
            # column list may have to come from the first record itself.
            first = False
            if not raw_headers:
                if self.csv_headers_all and isinstance(r, dict):
                    if isinstance(qs, utils.DictCursor):
                        raw_headers = qs.field_order
                    else:
                        raw_headers = r.keys()
                else:
                    raise Exception('No headers specified.')
            for name in raw_headers:
                if name in header_names:
                    name_key = name
                    header_data[name] = header_names.get(name_key)
                elif callable(name):
                    # This is likely a Formatter instance.
                    name_key = name.name
                    header_data[name_key] = name.short_description
                elif isinstance(name, (tuple, list)) and len(name) == 2:
                    name_key, name_key_verbose = name
                    header_data[name_key] = name_key_verbose
                elif isinstance(name, string_types) and hasattr(self, name):
                    # This is likely a ModelAdmin method name.
                    name_key = name
                    header_data[name_key] = getattr(self, name).short_description
                elif hasattr(name, 'short_description'):
                    name_key = name
                    header_data[name_key] = getattr(name, 'short_description')
                elif hasattr(self.model, name):
                    name_key = name
                    if hasattr(getattr(self.model, name), 'short_description'):
                        header_data[name_key] = getattr(
                            getattr(self.model, name), 'short_description')
                    else:
                        header_data[name_key] = name
                else:
                    # Fall back to the raw column name as its own label.
                    name_key = name
                    header_data[name_key] = name_key
                header_data[name_key] = header_data[name_key].title()
                fieldnames.append(name_key)

            writer = csv.DictWriter(
                response, fieldnames=fieldnames, quoting=self.csv_quoting)
            writer.writerow(header_data)

        data = {}
        for name in raw_headers:
            if isinstance(r, dict):
                if name in r:
                    # Dict rows are copied as-is: no ASCII conversion and
                    # no HTML stripping is applied to these values.
                    data[name] = r[name]
                    continue
            if callable(name):
                # This is likely a Formatter instance.
                name_key = name.name
                if hasattr(name, 'plaintext'):
                    data[name_key] = to_ascii(name(r, plaintext=True))
                else:
                    data[name_key] = to_ascii(name(r))
            elif isinstance(name, (tuple, list)) and len(name) == 2:
                name_key, name_key_verbose = name
                if hasattr(self, name_key):
                    data[name_key] = to_ascii(getattr(self, name_key))
                else:
                    data[name_key] = to_ascii(getattr(r, name_key))
            elif isinstance(name, string_types) and hasattr(self, name):
                # This is likely a ModelAdmin method name.
                name_key = name
                data[name_key] = to_ascii(getattr(self, name)(r))
            elif isinstance(name, string_types) and hasattr(r, name):
                name_key = name
                data[name_key] = to_ascii(getattr(r, name))
            else:
                name_key = name
                data[name_key] = to_ascii(get_attr(r, name))
            if callable(data[name_key]):
                data[name_key] = to_ascii(data[name_key]())
            if self.csv_remove_html:
                data[name_key] = utils.remove_html(data[name_key])
        writer.writerow(data)
    return response
def score(self, text, tests=["TOXICITY"], context=None, languages=None,
          do_not_store=False, token=None, text_type=None):
    """Score ``text`` with the Perspective ``comments:analyze`` endpoint.

    Args:
        text: The comment to analyse. Only the first 3000 characters are
            sent; longer input triggers a warning.
        tests: An attribute name, a list of attribute names, or a dict
            mapping attribute names to option dicts. Allowed option keys
            are ``scoreType`` and ``scoreThreshold`` (the historical
            misspelling ``scoreThreshhold`` is still accepted and sent as
            ``scoreThreshold``).
        context: Accepted for interface compatibility; not used.
        languages: Optional iterable of language codes; each is
            lower-cased and kept only if ``validate_language`` accepts it.
        do_not_store: Sent as ``doNotStore`` when truthy.
        token: Passed through to the returned ``Comment``.
        text_type: ``"html"`` or ``"md"`` to pre-clean the text with
            ``remove_html``; falsy to send the text unchanged.

    Returns:
        A ``Comment`` holding one ``Attribute`` (with its ``Span`` objects)
        per requested test.

    Raises:
        ValueError: for an invalid ``tests`` value, option key or
            ``text_type``.
        PerspectiveAPIException: when the API response contains an error.
    """
    # The list default is only read and rebound below, never mutated, so
    # sharing it between calls is harmless; it is kept for compatibility.
    if isinstance(tests, str):
        tests = [tests]
    if not isinstance(tests, (list, dict)):
        raise ValueError("Invalid list/dictionary provided for tests")
    if isinstance(tests, list):
        tests = {test: {} for test in tests}

    if text_type:
        kind = text_type.lower()
        if kind == "html":
            text = remove_html(text)
        elif kind == "md":
            text = remove_html(text, md=True)
        else:
            raise ValueError(
                "{0} is not a valid text_type. Valid options are 'html' or 'md'"
                .format(str(text_type)))

    # Validate the options and build the request attributes in a fresh
    # dict so the caller's ``tests`` mapping is never modified.
    requested = {}
    for test, options in tests.items():
        if test not in allowed:
            warnings.warn(
                "{0} might not be accepted as a valid test.".format(str(test)))
        cleaned = {}
        for key, value in options.items():
            if key not in ("scoreType", "scoreThreshold", "scoreThreshhold"):
                raise ValueError(
                    "{0} is not a valid sub-property for {1}".format(key, test))
            # The API field is spelled "scoreThreshold"; the validator used
            # to accept only the misspelled form, which the API rejects.
            if key == "scoreThreshhold":
                key = "scoreThreshold"
            cleaned[key] = value
        requested[test] = cleaned

    # The API will only grade text less than 3k characters long
    if len(text) > 3000:
        # TODO: allow disassembly/reassembly of >3000char comments
        warnings.warn(
            "Perspective only allows 3000 character strings. "
            "Only the first 3000 characters will be sent for processing")
        text = text[:3000]

    new_langs = []
    if languages:
        lowered = (language.lower() for language in languages)
        new_langs = [language for language in lowered
                     if validate_language(language)]

    # packaging data
    url = Perspective.base_url + "/comments:analyze"
    querystring = {"key": self.key}
    payload_data = {"comment": {"text": text}, "requestedAttributes": requested}
    # The old ``!= None`` check was always true; only send the field when
    # there is at least one validated language.
    if new_langs:
        payload_data["languages"] = new_langs
    if do_not_store:
        payload_data["doNotStore"] = do_not_store
    payload = json.dumps(payload_data)
    headers = {'content-type': "application/json"}
    response = requests.post(url, data=payload, headers=headers,
                             params=querystring)
    data = response.json()
    if "error" in data:
        raise PerspectiveAPIException(data["error"]["message"])

    c = Comment(text, [], token)
    base = data["attributeScores"]
    for test in requested:
        summary = base[test]["summaryScore"]
        a = Attribute(test, [], summary["value"], summary["type"])
        # Tolerate a response that carries no span scores for an attribute.
        for span in base[test].get("spanScores", []):
            span_score = span["score"]
            s = Span(span["begin"], span["end"], span_score["value"],
                     span_score["type"], c)
            a.spans.append(s)
        c.attributes.append(a)
    return c
def csv_export(self, request, qs=None, raw_headers=None):
    """
    Build a CSV attachment response from this admin's records.

    ``raw_headers`` (or ``self.get_csv_raw_headers(request)`` when it is
    None) lists the columns. An entry may be:

    * a key of ``self.get_csv_header_names(request)``;
    * a callable exposing ``name`` and ``short_description``, called with
      the record (and ``plaintext=True`` when it has a ``plaintext``
      attribute);
    * a 2-item ``(attribute_name, verbose_label)`` tuple or list;
    * the name of an attribute of this admin, called with the record;
    * the name of an attribute of the record, or a "__"-delimited path
      resolved through ``get_attr``.

    For dict records, an entry that is a key of the record is copied
    verbatim and skips all other processing.

    At most ``self.csv_record_limit`` records of
    ``self.get_csv_queryset(request, qs)`` are written. The header row is
    emitted together with the first record, so an empty queryset produces an
    empty body.

    Raises ``Exception('No headers specified.')`` when no headers are given
    and they cannot be taken from the first record.
    """
    # ``mimetype`` was removed from HttpResponse in Django 1.7, while
    # ``content_type`` is accepted by old and new releases alike.
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename=%s.csv' \
        % slugify(self.model.__name__)
    if raw_headers is None:
        raw_headers = self.get_csv_raw_headers(request)

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so the header/value dispatch below does not die with a NameError.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    def get_attr(obj, name, as_name=False):
        """
        Dereferences "__" delimited variable names.
        """
        parts = name.split('__')
        cursor = obj
        for part in parts:
            name = part
            cursor = getattr(cursor, part, None)
            if callable(cursor):
                cursor = cursor()
        if cursor == obj:
            return
        if as_name:
            return name
        return cursor

    header_data = {}
    fieldnames = []
    header_names = self.get_csv_header_names(request)

    first = True
    qs = self.get_csv_queryset(request, qs)
    for r in qs[:self.csv_record_limit]:
        if first:
            # The header is resolved lazily because, for dict rows, the
            # column list may have to come from the first record itself.
            first = False
            if not raw_headers:
                if self.csv_headers_all and isinstance(r, dict):
                    if isinstance(qs, utils.DictCursor):
                        raw_headers = qs.field_order
                    else:
                        raw_headers = r.keys()
                else:
                    raise Exception('No headers specified.')
            for name in raw_headers:
                if name in header_names:
                    name_key = name
                    header_data[name] = header_names.get(name_key)
                elif callable(name):
                    # This is likely a Formatter instance.
                    name_key = name.name
                    header_data[name_key] = name.short_description
                elif isinstance(name, (tuple, list)) and len(name) == 2:
                    name_key, name_key_verbose = name
                    header_data[name_key] = name_key_verbose
                elif isinstance(name, string_types) and hasattr(self, name):
                    # This is likely a ModelAdmin method name.
                    name_key = name
                    header_data[name_key] = getattr(self, name).short_description
                elif hasattr(name, 'short_description'):
                    name_key = name
                    header_data[name_key] = getattr(name, 'short_description')
                elif hasattr(self.model, name):
                    name_key = name
                    if hasattr(getattr(self.model, name), 'short_description'):
                        header_data[name_key] = getattr(
                            getattr(self.model, name), 'short_description')
                    else:
                        header_data[name_key] = name
                else:
                    # Fall back to the raw column name as its own label.
                    name_key = name
                    header_data[name_key] = name_key
                header_data[name_key] = header_data[name_key].title()
                fieldnames.append(name_key)

            writer = csv.DictWriter(
                response, fieldnames=fieldnames, quoting=self.csv_quoting)
            writer.writerow(header_data)

        data = {}
        for name in raw_headers:
            if isinstance(r, dict):
                if name in r:
                    # Dict rows are copied as-is: no ASCII conversion and
                    # no HTML stripping is applied to these values.
                    data[name] = r[name]
                    continue
            if callable(name):
                # This is likely a Formatter instance.
                name_key = name.name
                if hasattr(name, 'plaintext'):
                    data[name_key] = to_ascii(name(r, plaintext=True))
                else:
                    data[name_key] = to_ascii(name(r))
            elif isinstance(name, (tuple, list)) and len(name) == 2:
                name_key, name_key_verbose = name
                if hasattr(self, name_key):
                    data[name_key] = to_ascii(getattr(self, name_key))
                else:
                    data[name_key] = to_ascii(getattr(r, name_key))
            elif isinstance(name, string_types) and hasattr(self, name):
                # This is likely a ModelAdmin method name.
                name_key = name
                data[name_key] = to_ascii(getattr(self, name)(r))
            elif isinstance(name, string_types) and hasattr(r, name):
                name_key = name
                data[name_key] = to_ascii(getattr(r, name))
            else:
                name_key = name
                data[name_key] = to_ascii(get_attr(r, name))
            if callable(data[name_key]):
                data[name_key] = to_ascii(data[name_key]())
            if self.csv_remove_html:
                data[name_key] = utils.remove_html(data[name_key])
        writer.writerow(data)
    return response