def get_sentence(word_input):

    found = False
    sentence = ""
    for obj in google_json_object:
        google_word = clean_data_from_html(obj["WORD"])
        if google_word == word_input:
            if obj["SENTENCE"] != "" and obj["SENTENCE"] != "ga_non" and obj["SENTENCE"] is not None and obj["SENTENCE"] != "":
                sentence = obj["SENTENCE"]
                found = True
            break
    if found:
        print("sentence from google data..:"+sentence)
        return sentence
    else:
        for obj in your_dictionary_json_object:
            your_dict_word = clean_data_from_html(obj["WORD"])
            if your_dict_word == word_input:
                if obj["SENTENCE"] != "" and obj["SENTENCE"] != "ga_non" and obj["SENTENCE"] is not None and obj["SENTENCE"] != "":
                    sentence = obj["SENTENCE"][0]
                    found = True
                break
    if found:
        print("sentence from your dictionary.."+ sentence)
        return sentence
    else:
        print("sentence not found for word....: "+word_input)
        return ""
def fetch_word_details():

    initialize_variables("OPEN")
    word = "Bedizen"
    find_id = random.randint(10000, 20000)
    final_type = get_type(word)
    final_meaning = get_meaning(word)
    final_sentence = clean_data_from_html(get_sentence(word))
    final_synonyms = get_synonyms(word)
    final_antonyms = get_antonyms(word)
    final_attr1 = clean_data_from_html(get_attr1(word))

    word_string = ""
    word_string = word_string+"{\"ID\":"+str(find_id)+","
    word_string = word_string+"\"WORD\":\""+word+"\","
    word_string = word_string + "\"TYPE\":\"" + final_type + "\","
    word_string = word_string + "\"MEANING\":\"" + final_meaning + "\","
    word_string = word_string + "\"SENTENCE\":\"" + final_sentence + "\","
    word_string = word_string + "\"SYNONYMS\":\"" + final_synonyms + "\","
    word_string = word_string + "\"ANTONYMS\":\"" + final_antonyms + "\","
    word_string = word_string + "\"ATTR1\":\"" + final_attr1 + "\","
    word_string = word_string + "\"LINK\":\"\","
    word_string = word_string + "\"ATTR2\":\"\"}"

    print("final word string..:"+word_string)

    write_with_remove_redundancy(word_string, word)

    initialize_variables("CLOSE")
def json_creator_barrons():

    print("creating barron 333 and 800 ..:")
    filein = open(
        r"D:\Video Work Area\GRE WORDS APP\data\production\WordsList.json", "r")
    filein_obj = json.load(filein)["words"]

    barron800_words_in = open(
        r"D:\Video Work Area\GRE WORDS APP\data\Barron800WordsList.txt", "r")
    # mode "w" already truncates, so no explicit seek()/truncate() is needed
    barron800_json_out = open(
        r"D:\Video Work Area\GRE WORDS APP\data\production\Barron800.json", "w")

    barron333_words_in = open(
        r"D:\Video Work Area\GRE WORDS APP\data\Barron333WordsList.txt", "r")
    barron333_json_out = open(
        r"D:\Video Work Area\GRE WORDS APP\data\production\Barron333.json", "w")

    barron800_final_string = "{\"BARRON800\":["
    barron333_final_string = "{\"BARRON333\":["

    for obj in filein_obj:
        word = clean_data_from_html(obj["WORD"])
        barron800_words_in.seek(0)
        barron333_words_in.seek(0)
        wid = obj["ID"]
        for barron800_word in barron800_words_in:
            if word == clean_data_from_html(barron800_word):
                barron800_final_string = barron800_final_string + "{\"BARRONID\":" + str(
                    wid) + "},"
                break
        for barron333_word in barron333_words_in:
            if word == clean_data_from_html(barron333_word):
                barron333_final_string = barron333_final_string + "{\"BARRONID\":" + str(
                    wid) + "},"
                break

    # strip the trailing comma; rstrip is safe even when no word matched
    barron333_final_string = barron333_final_string.rstrip(",")
    barron800_final_string = barron800_final_string.rstrip(",")

    barron333_final_string = barron333_final_string + "]}"
    barron800_final_string = barron800_final_string + "]}"

    print("generated barron 333 string..:" + barron333_final_string)
    print("generated barron 800 string..:" + barron800_final_string)

    barron333_json_out.write(barron333_final_string)
    barron800_json_out.write(barron800_final_string)
    barron333_json_out.flush()
    barron800_json_out.flush()

    barron800_json_out.close()
    barron333_json_out.close()
    barron800_words_in.close()
    barron333_words_in.close()
    filein.close()
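# json_creator_barrons builds both documents through string concatenation. A
# dict-based sketch of the same output, assuming the matched IDs were first
# collected into plain lists:
def build_barron_json(key, ids):
    return json.dumps({key: [{"BARRONID": wid} for wid in ids]})

# build_barron_json("BARRON800", [12001, 12002])
# -> '{"BARRON800": [{"BARRONID": 12001}, {"BARRONID": 12002}]}'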
def write_with_remove_redundancy(word_string, word):

    file = open(
        r"D:\Video Work Area\GRE WORDS APP\data\scrapped\WordsList.json", "r+")
    redundant = False
    redundant_count = 1
    all_obj = json.load(file)["words"]
    for obj in all_obj:
        if word == obj["WORD"] and obj["WORD"] != "":
            redundant = True
            if redundant_count == 1:
                print(
                    "Word details were already added to the file.. updating values in temp word list file"
                )
                obj["TYPE"] = get_type(word)
                obj["MEANING"] = get_meaning(word)
                obj["SENTENCE"] = clean_data_from_html(get_sentence(word))
                obj["SYNONYMS"] = get_synonyms(word)
                obj["ANTONYMS"] = get_antonyms(word)
                obj["ATTR1"] = clean_data_from_html(get_attr1(word))
                redundant_count = redundant_count + 1
            else:
                print(
                    "Found multiple entries in temp word details .. "
                    "blanking the duplicate entry.....total redundancy..:"
                    + str(redundant_count))
                obj["WORD"] = ""
                obj["TYPE"] = ""
                obj["MEANING"] = ""
                obj["SENTENCE"] = ""
                obj["SYNONYMS"] = ""
                obj["ANTONYMS"] = ""
                obj["ATTR1"] = ""
                redundant_count = redundant_count + 1

    if redundant:
        file.seek(0)
        file.truncate()
        file.write("{\"words\":")
        json.dump(all_obj, file)
        file.write("}")
        file.flush()
        file.close()
    else:
        # drop the trailing "]}" so the new record can be appended inside the array;
        # seek before truncating, otherwise the write lands past the new end of file
        file.seek(file.tell() - 2)
        file.truncate()
        file.write(",")
        file.write(word_string)

        file.write("]}")
        file.flush()
        file.close()
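# A less fragile alternative to splicing bytes at the end of the file is to
# rewrite the whole document; a sketch assuming the same {"words": [...]}
# layout used above:
def append_word(path, word_obj):
    with open(path, "r+") as f:
        data = json.load(f)
        data["words"].append(word_obj)
        f.seek(0)
        f.truncate()
        json.dump(data, f)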
def marriamwebsterScrap(site, word):
    marriamDicObj = {}
    marriamDicObj["WORD"] = word
    marriamwebsterFinalSite = site + word
    marriamDicObj["SITE"] = marriamwebsterFinalSite
    marriamwebsterContent = requests.get(marriamwebsterFinalSite,
                                         headers=header).text
    marriamSoup = BeautifulSoup(marriamwebsterContent, features="html.parser")
    # print(marriamSoup.prettify())
    synm_coming = False
    antm_coming = False

    try:
        mariamSynmDiv = marriamSoup.find_all(
            "div", {"class": "thesaurus-synonyms-module-anchor"})[0]
        pTagAll = mariamSynmDiv.findAll('p')
        for pTag in pTagAll:
            if synm_coming:
                marriamDicObj["SYNONYMS"] = clean_data_from_html(pTag.text)
                synm_coming = False
            if clean_data_from_html(pTag.text) == "Synonyms":
                synm_coming = True
            if antm_coming:
                marriamDicObj["ANTONYMS"] = clean_data_from_html(pTag.text)
                antm_coming = False
            if clean_data_from_html(pTag.text) == "Antonyms":
                antm_coming = True

    except Exception as ex:
        marriamDicObj["SYNONYMS"] = "ga_non"
        marriamDicObj["ANTONYMS"] = "ga_non"

    if "SYNONYMS" not in marriamDicObj:
        marriamDicObj["SYNONYMS"] = "ga_non"
    if "ANTONYMS" not in marriamDicObj:
        marriamDicObj["ANTONYMS"] = "ga_non"

    try:
        citeexamples = marriamSoup.find_all("div",
                                            {"class": "in-sentences"})[0]
        citespan = citeexamples.findAll('span')
        citeexamplesList = []
        for span in citespan:
            # keep only the first example sentence
            citeexamplesList.append(clean_data_from_html(span.text))
            break
        marriamDicObj["SENTENCE"] = citeexamplesList
    except Exception as ex:
        marriamDicObj["SENTENCE"] = "ga_non"
    return marriamDicObj
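# The flag-based scan above pairs each "Synonyms"/"Antonyms" heading with the
# paragraph that follows it. BeautifulSoup can express that pairing directly;
# a sketch under the same markup assumptions as the code above:
def paragraph_after_heading(soup, heading_text):
    for p in soup.find_all("p"):
        if clean_data_from_html(p.text) == heading_text:
            following = p.find_next("p")
            return clean_data_from_html(following.text) if following else None
    return None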
def get_meaning(word_input):

    found = False
    meaning = ""
    for obj in vocab_json_object:
        vocab_word = clean_data_from_html(obj["WORD"])
        if vocab_word == word_input:
            found = True
            meaning = obj["MEANING"]
            break

    if found:
        print("meaning found in vocab dictionary : " + meaning)
        return meaning
    else:
        dict_meaning = dictionary.meaning(word_input)
        if dict_meaning is not None:
            for key in dict_meaning:
                for m in dict_meaning.get(key):
                    # keep only the first sense of the first part of speech
                    found = True
                    meaning = m
                    break
                if found:
                    break
    if found:
        print("meaning found in pyDictionary..:" + meaning)
        return meaning
    else:
        print("error.. meaning not found for...." + word_input)
        return ""
def get_antonyms(word_input):
    antonyms = ""
    found = False
    for obj in google_json_object:
        google_word = clean_data_from_html(obj["WORD"])
        if google_word == word_input:
            if obj["ANTONYMS"] != "" and obj["ANTONYMS"] != "ga_non" and obj["ANTONYMS"] is not None and obj["ANTONYMS"] != "":
                antonyms = obj["ANTONYMS"]
                found = True
            break
    if found:
        print("antonyms from google..:" + antonyms)
        return antonyms
    else:
        for obj in marrian_json_object:
            marrian_word = obj["WORD"]
            if marrian_word == word_input:
                if obj["ANTONYMS"] != "" and obj["ANTONYMS"] != "ga_non" and obj["ANTONYMS"] is not None and obj["ANTONYMS"] != "":
                    antonyms = obj["ANTONYMS"]
                    found = True
                break
    if found:
        print("antonyms from marriam dictionary..:" + antonyms)
        return antonyms
    else:
        print("Antonyms not found for ...:" + word_input+".... returning empty")
        return antonyms
def get_synonyms(word_input):
    synonyms = ""
    found = False
    for obj in google_json_object:
        google_word = clean_data_from_html(obj["WORD"])
        if google_word == word_input:
            if obj["SYNONYMS"] != "" and obj["SYNONYMS"] != "ga_non" and obj[
                    "SYNONYMS"] is not None and obj["SYNONYMS"] != "":
                synonyms = obj["SYNONYMS"]
                found = True
            break
    if found:
        print("synonyms from google..:" + synonyms)
        return synonyms
    else:
        for obj in marrian_json_object:
            marrian_word = obj["WORD"]
            if marrian_word == word_input:
                if obj["SYNONYMS"] != "" and obj[
                        "SYNONYMS"] != "ga_non" and obj[
                            "SYNONYMS"] is not None and obj["SYNONYMS"] != "":
                    synonyms = obj["SYNONYMS"]
                    found = True
                break
    if found:
        print("synonyms from marriam dictionary..:" + synonyms)
        return synonyms
    else:
        for obj in vocab_json_object:
            vocab_word = obj["WORD"]
            if vocab_word == word_input:
                if obj["SYNONYMS"] != "" and obj[
                        "SYNONYMS"] != "ga_non" and obj[
                            "SYNONYMS"] is not None and obj["SYNONYMS"] != "":
                    synonyms = obj["SYNONYMS"]
                    found = True
                break
    if found:
        print("synonyms from vocab dictionary..:" + synonyms)
        return synonyms
    else:
        print("not found for ...:" + word_input)
        return synonyms
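# get_sentence, get_antonyms, and get_synonyms all walk the same cascade of
# sources. Reusing the has_valid_field sketch from earlier, a generic lookup
# could replace all three (sources: ordered list of (label, json list) pairs):
def lookup_field(word_input, key, sources):
    for label, source in sources:
        for obj in source:
            if clean_data_from_html(obj["WORD"]) == word_input:
                if has_valid_field(obj, key):
                    print(key + " from " + label + "..:" + str(obj[key]))
                    return obj[key]
                break
    print(key + " not found for word....: " + word_input)
    return ""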
def collinScrap(site, word):

    collinDicObj = {}
    collinDicObj["WORD"] = word
    collingsDictionaryFinalURL = site + word
    collinDicObj["SITE"] = collingsDictionaryFinalURL
    collinDictionaryContent = requests.get(collingsDictionaryFinalURL,
                                           headers=header).text
    collingDictionarySoup = BeautifulSoup(collinDictionaryContent,
                                          features="html.parser")

    try:
        collingDescription = collingDictionarySoup.find_all(
            "div", {"class": "def"})[0].text
        collingDescription = clean_data_from_html(collingDescription)
        collinDicObj["ATTR1"] = collingDescription
    except Exception as ex:
        collinDicObj["ATTR1"] = "ga_non"

    return collinDicObj
def googleScrap(googleSite, word):
    googleDictionaryObj = {}
    googleDictionaryObj["WORD"] = clean_data_from_html(word)
    googleDefineFinalSite = googleSite + word
    #print("google final site:" + googleDefineFinalSite)
    googleDictionaryObj["SITE"] = googleDefineFinalSite
    try:
        page = requests.get(googleDefineFinalSite, headers=header).text
        googleDefineSoup = BeautifulSoup(page, features="html.parser")
        googleSentene = ""
        googleSentenceSkipFirst = True
        try:
            for div in googleDefineSoup.find_all(
                    lambda tag: tag.name == 'div' and tag.get('class'
                                                              ) == ['vk_gy']):
                if googleSentenceSkipFirst:
                    googleSentenceSkipFirst = False
                else:
                    googleSentence += div.text
                    break
        except (IndexError, ValueError):
            googleSentence += "ga_non"
        googleDictionaryObj["SENTENCE"] = clean_data_from_html(googleSentence)

        googleSynms = ""
        removeSymnText = True
        try:
            for td in googleDefineSoup.find_all(
                    "table", {"class": "vk_tbl vk_gy"})[0].find_all('td'):
                if removeSymnText:
                    removeSymnText = False
                else:
                    googleSynms += td.text
                    break
        except (IndexError, ValueError):
            googleSynms += "ga_non"
        googleSynms = clean_data_from_html(googleSynms)
        googleSynms = googleSynms.split(";")[0]
        googleDictionaryObj["SYNONYMS"] = googleSynms
        #print(googleSynms)

        googleAntonm = ""
        removeAntonymText = True
        try:
            for td in googleDefineSoup.find_all(
                    "table", {"class": "vk_tbl vk_gy"})[1].find_all('td'):
                if removeAntonymText:
                    removeAntonymText = False
                else:
                    googleAntonm += td.text
                    break
        except (IndexError, ValueError):
            googleAntonm += "ga_non"
        googleAntonm = clean_data_from_html(googleAntonm)
        googleDictionaryObj["ANTONYMS"] = googleAntonm
        #print("Google Antonyms......::::::::::::"+googleAntonm)
        #print(googleAntonm)
    except Exception as e:
        print(
            "google connection attempt failed.. host did not respond to the request"
        )
        #print(googleDefineSoup.prettify())
    return googleDictionaryObj
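# The skip-first-then-break loops above each keep the second element the
# selector yields. A slicing sketch of the sentence lookup, under the same
# selector assumptions as the code above:
def second_vk_gy_text(soup):
    divs = soup.find_all(lambda tag: tag.name == 'div'
                         and tag.get('class') == ['vk_gy'])
    return divs[1].text if len(divs) > 1 else "ga_non"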
def vocalbularySiteScrap(site, word):
    vocabularyDictionaryObj = {}
    vocabularyDictionaryObj["WORD"] = word
    vocabularySiteFinalUrl = site + word
    vocabularyDictionaryObj["SITE"] = vocabularySiteFinalUrl
    #print(vocabularySiteFinalUrl)
    try:
        vocabularySiteContent = urlopen(vocabularySiteFinalUrl)
        vocabularySiteSoup = BeautifulSoup(vocabularySiteContent,
                                           features="html.parser")
        countLine = 3
        counter = 1
        vocabSiteMeaning = vocabularySiteSoup.find(
            lambda tag: tag.name == 'h3' and tag.get('class'
                                                     ) == ['definition'])
        # keep the third child node of the <h3>, which holds the definition text
        for vocabMeaningSTR in vocabSiteMeaning:
            if countLine == counter:
                vocabMeaningSTR = clean_data_from_html(vocabMeaningSTR)
                break
            else:
                counter = counter + 1
    except Exception as ex:
        vocabMeaningSTR = "ga_non"
    vocabularyDictionaryObj["MEANING"] = vocabMeaningSTR

    try:
        VocabularySitedescription = vocabularySiteSoup.find(
            "meta", property="og:description")
        vocabularyDictionaryObj["ATTR1"] = (
            VocabularySitedescription["content"]
            if VocabularySitedescription else "ga_non")
        #print(VocabularySitedescription["content"] if VocabularySitedescription else "ga_non")
    except Exception as ex:
        vocabularyDictionaryObj["ATTR1"] = "ga_non"

    try:
        vocabularySentence = vocabularySiteSoup.find("div", {
            "class": "example"
        }).get_text()
        vocabularyDictionaryObj["SENTENCE"] = clean_data_from_html(
            vocabularySentence if vocabularySentence else "ga_non")
        #print(vocabularyDictionaryObj["SENTENCE"])
    except Exception as ex:
        vocabularyDictionaryObj["SENTENCE"] = "ga_non"

    vocabSynmsList = ""
    try:
        vocabularySynonyms = vocabularySiteSoup.findAll("a", {"class": "word"})
        for element in vocabularySynonyms:
            vocabSynmsList += element.text + ", "
        vocabSynmsList = vocabSynmsList[:-2]
        vocabularyDictionaryObj["SYNONYMS"] = vocabSynmsList
        #print(vocabSynmsList)
    except Exception as ex:
        vocabularyDictionaryObj["SYNONYMS"] = "ga_non"

    #print(vocabularySiteSoup.prettify())

    return vocabularyDictionaryObj
from gre_words_scripts.utils import init_files
import json
import random
import time

all_words = open(r"D:\Video Work Area\GRE WORDS APP\data\all_words.txt", "r")
words_list = open(
    r"D:\Video Work Area\GRE WORDS APP\data\production\WordsList.json", "r")

# init_files()  # >>>>>>>>>>>> reset values most imp method <<<<<<<<<<<<<<<<<

words_list_dict = json.load(words_list)["words"]

count = 0
for word in all_words:

    word = clean_data_from_html(word)

    count = count + 1

    exists = False
    for word_obj in words_list_dict:
        if word_obj["WORD"] == word:
            exists = True
            break
    if not exists:
        print("\n\n\n     >>>>>>>>>>>>>>>>>>>>>>>>>>>      Processing ..: " +
              word + "        ::" + str(count) +
              "            Percent Done..:" + str((int((count / 801) * 100))) +
              "%   <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n")

        # get word details and store them in scrapper folder