def create_noun(html_full_noun):
    """Scrape a noun page and persist it as a ``Substantivo`` node.

    Builds a Cypher ``CREATE`` statement from the scraped heading
    ("term - <class> <gender>"), the syllable division and the plural,
    executes it via ``db_utils`` and returns the query string.
    """
    rows, raw_hyphenation = scrap_noun(html_full_noun)

    heading = rows[0].split(" - ")
    main_term = scrapper.ascii_2_portuguese(heading[0])
    # Gender is the last word of the heading's second half.
    gender = scrapper.ascii_2_portuguese(heading[1]).split(" ")[-1]
    divisao_silabica = scrapper.ascii_2_portuguese(raw_hyphenation.replace("·", "-"))
    # The plural form is the last word of the final scraped row.
    plural = rows[-1].split(" ")[-1]

    query = (
        f'CREATE (:Substantivo {{nid:"{main_term}", '
        f'divisao_silabica:"{divisao_silabica}", '
        f'genero:"{gender}", plural:"{plural}"}});'
    )
    db_utils.execute_query(query=query)
    return query
def create_verb(html_full_verb):
    """Scrape a verb page and persist it as a ``Verbo`` node.

    Extracts the lemma from the ``<h1>`` heading, conjugation tables via
    ``scrap_verb``, builds a Cypher ``CREATE`` statement mapping each
    conjugation row to the property named in ``verb_structure``, executes
    it and returns the query string.
    """
    heading = regex.findall(r"<h1>.+?</h1>", html_full_verb)
    main_term = regex.sub(r"<.+?>", "", heading[0]).split(" -")[0].strip(" ")

    verb_data = scrap_verb(html_full_verb)
    divisao_silabica = verb_data[2].replace("·", "-")

    def _list_body(raw):
        # Turn a ':::'-joined conjugation row into the interior of a
        # Cypher list literal (items separated by '","').
        return raw.replace("Pessoal:::", "").replace(":::", "\",\"").replace(" ", "")

    fragments = [
        "CREATE (:Verbo {nid:\"" + main_term
        + "\", divisao_silabica:\"" + divisao_silabica + "\", "
    ]
    for index, row in enumerate(verb_data[0]):
        key = str(index)
        if key not in verb_structure.keys():
            continue
        if index == 31:
            # Row 31 is replaced by the past participle (verb_data[1]);
            # it is also assumed to be the last property, so no trailing
            # comma is appended after it.
            row = scrapper.ascii_2_portuguese(verb_data[1].replace(":::", ","))
            fragments.append(verb_structure[key] + ":[\"" + _list_body(row) + "\"]")
        else:
            fragments.append(verb_structure[key] + ":[\"" + _list_body(row) + "\"], ")
    fragments.append("});")

    query = "".join(fragments)
    db_utils.execute_query(query=query)
    return query
def scrap_noun(html_full_noun_scrap):
    """Scrape a noun page into cleaned text rows plus syllable division.

    Returns a tuple ``(clean_rows, divisao_silabica)`` where ``clean_rows``
    is the list of non-empty, normalized text lines and ``divisao_silabica``
    is the tag-stripped, re-encoded syllable-division string.
    """
    scraped = scrap_general(html_full_noun_scrap)

    # Normalize separators and drop rows that are empty after stripping.
    normalized = [
        row.replace(":::", " ").replace(" : ", " ").strip(" ")
        for row in scraped[0]
    ]
    clean_rows = [row for row in normalized if row.strip(" ")]

    divisao_silabica = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(scraped[1]))
    return clean_rows, divisao_silabica
def scrap_verb(html_full_verb_scrap):
    """Scrape a verb page into conjugation rows, past participle and
    syllable division.

    Returns a tuple ``(clean_data, participio_passado, divisao_silabica)``:
    - ``clean_data``: non-empty text lines of the conjugation table, with
      ``<br>`` converted to the ``:::`` separator;
    - ``participio_passado``: the past-participle cell, words joined by ``:::``;
    - ``divisao_silabica``: tag-stripped, re-encoded syllable division.

    NOTE(review): the transforms below are strictly order-dependent; do not
    reorder them. Several whitespace literals (the ``while " "`` loops and
    the repeated ``" / "`` replaces) look like they originally contained
    runs of multiple spaces that were collapsed in transit — presumably the
    intent is collapsing double spaces to one; confirm against the original
    file before touching them.
    """
    # Keep only the markup between the main-text anchor and the first </table>.
    html_full_verb_scrap = html_full_verb_scrap.split("name=maintext")[1]
    html_full_verb_scrap = html_full_verb_scrap.split("</table>")[0]
    html_full_verb_scrap = filter_funcs.replace_html_entities(
        html_full_verb_scrap)
    # Grab the syllable-division paragraph before it is stripped out below.
    divisao_silabica = regex.findall(r"<p.+?Divisão silábica.+?</p>",
                                     html_full_verb_scrap)
    html_full_verb_scrap = html_full_verb_scrap.replace("\t", "").replace(
        "\r", "")[1:].strip(" ")
    html_full_verb_scrap = regex.sub(r"<p.+?Divisão silábica.+?</p>", "",
                                     html_full_verb_scrap)

    # --- past participle: isolate the cell after the section label,
    # strip table tags then any remaining tags, normalize spacing. ---
    participio_passado = html_full_verb_scrap.split("Particípio passado")[1]
    participio_passado = regex.sub(r"</?t[dr]>", " ", participio_passado)
    participio_passado = regex.sub(r"<.+?>", " ", participio_passado)
    participio_passado = participio_passado.replace(" / ", "/")
    participio_passado = participio_passado.replace(" / ", "/")
    participio_passado = participio_passado.strip(" ")
    while " " in participio_passado:
        participio_passado = participio_passado.replace(" ", " ")
    while "\n\n" in participio_passado:
        participio_passado = participio_passado.replace("\n\n", "")
    participio_passado = participio_passado.strip(" ")
    # Words of the participle are joined with the project's ':::' separator.
    participio_passado = participio_passado.replace(" ", ":::")

    # --- conjugation table: same whitespace normalization, then <br>
    # becomes the ':::' separator and alternative forms are compacted. ---
    while " " in html_full_verb_scrap:
        html_full_verb_scrap = html_full_verb_scrap.replace(" ", " ")
    while "\n\n" in html_full_verb_scrap:
        html_full_verb_scrap = html_full_verb_scrap.replace("\n\n", "\n")
    html_full_verb_scrap = regex.sub(r"<br>", ":::", html_full_verb_scrap)
    html_full_verb_scrap = regex.sub(r" / ", "/", html_full_verb_scrap)
    html_full_verb_scrap = regex.sub(r" / ", "/", html_full_verb_scrap)
    html_full_verb_scrap = html_full_verb_scrap.replace(r" / ", "/")
    html_full_verb_scrap = html_full_verb_scrap.replace(r" / ", "/")

    # Strip all remaining tags and keep only non-empty lines.
    content = regex.sub(r"<.+?>", "", html_full_verb_scrap)
    temp_data = content.split("\n")
    clean_data = list()
    for line in temp_data:
        if "" == line.strip(" "):
            continue
        clean_data.append(line.strip(" "))

    # First (and expected only) syllable-division match, tags removed.
    divisao_silabica = divisao_silabica[0]
    divisao_silabica = regex.sub(r"<.+?>", "",
                                 scrapper.ascii_2_portuguese(divisao_silabica))
    return clean_data, participio_passado, divisao_silabica
def scrap_adverb(html_full_adverb_scrap):
    """Scrape an adverb page into a dict.

    Keys produced: ``lemma`` (word before " - advérbio"),
    ``mais_informacoes`` (last non-matching line seen before a
    "Destaques" line stops the scan) and ``divisao_silabica``.
    """
    scraped = scrap_general(html_full_adverb_scrap)
    result = dict()
    for raw in scraped[0]:
        entry = raw.replace(":::", " ").replace(" : ", " ").strip(" ")
        if "" == entry.strip(" ") or '' == entry:
            continue
        # NOTE(review): literal spaces here may have been collapsed in
        # transit; presumably this squeezes runs of spaces — confirm
        # against the original file.
        while " " in entry:
            entry = entry.replace(" ", " ")
        if " - advérbio" in entry:
            result["lemma"] = entry.split(" ")[0]
            continue
        if "Destaques" in entry:
            # Everything past the "Destaques" section is irrelevant.
            break
        # Overwritten on each pass: only the last such line is kept.
        result["mais_informacoes"] = entry
    result["divisao_silabica"] = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(scraped[1]))
    return result
def scrap_adjective(html_full_adjective_scrap):
    """Scrape an adjective page into a dict.

    Keys produced: ``lemma``, the four gender/number forms
    (``sing_masc``, ``sing_femi``, ``plur_masc``, ``plur_femi``),
    ``more_info`` (last unclassified line) and ``divisao_silabica``.
    """
    scraped = scrap_general(html_full_adjective_scrap)
    result = dict()
    for raw in scraped[0]:
        entry = raw.replace(":::", " ").replace(" : ", " ").strip(" ")
        # Skip blanks and the "Masculino Feminino" table header row.
        if "" == entry.strip(" ") or '' == entry or "Masculino Feminino" in entry:
            continue
        tokens = entry.split(" ")
        if " - adjetivo" in entry:
            result["lemma"] = tokens[0]
        elif "Singular" in entry:
            # Row layout: "Singular <masc> <fem>".
            result["sing_masc"], result["sing_femi"] = tokens[1], tokens[2]
        elif "Plural" in entry:
            # Row layout: "Plural <masc> <fem>".
            result["plur_masc"], result["plur_femi"] = tokens[1], tokens[2]
        else:
            # Overwritten on each pass: only the last such line is kept.
            result["more_info"] = entry
    result["divisao_silabica"] = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(scraped[1]))
    return result