def index_relevant_words(sentences, file_path):
    """Record references for watched words found in *sentences*.

    For each sentence, every word that appears in the module-global
    ``wordlist`` and passes ``is_relevant_sentence`` gets a reference
    stored via ``add_reference`` under the key "<file_path>::<sentence
    index>".  Matches are also echoed to stdout.

    Returns:
        True if at least one relevant word was indexed, else False.
    """
    found_any = False
    for sentence_index, sentence in enumerate(sentences):
        for word in get_words_from_sentence(sentence):
            # `wordlist` is a module-level global populated in the
            # __main__ block below.
            if word in wordlist and is_relevant_sentence(sentence, word):
                add_reference(word, file_path + "::{}".format(sentence_index))
                print(word, sentence)
                found_any = True
    return found_any


if __name__ == "__main__":
    now = datetime.datetime.now()
    global_data = get_global_data()
    wordlist = get_wordlist(json_format=True)

    for link in get_rss_feeds():
        print(link)
        # NOTE(review): the original also did `r = requests.get(link)` here but
        # never used the response; dropped to avoid a redundant network fetch.
        text = scrape_webiste(link)  # (sic) helper name as defined elsewhere
        if not is_text(text):
            continue
        sentences = get_sentences(text)

        # Article file name: "<day>-<month>-<year>::<running count>".
        file_name = "{}-{}-{}::{}".format(
            now.day, now.month, now.year, global_data['articles']['count'])
        global_data['articles']['count'] += 1
        file_path = os.path.join(global_data['articles']['storage_path'], file_name)
        with open(file_path, 'w+') as file:
            file.write(text)

        # BUG FIX: the original passed the raw `text` string, so the indexer
        # enumerated individual characters instead of sentences.
        words_indexed = index_relevant_words(sentences, file_path)
        if not words_indexed:
            # TODO(review): source was truncated at this point — the original
            # presumably discarded/rolled back the stored article when nothing
            # was indexed; restore that branch from the full file.
            pass
import json

from global_functions import get_global_data, get_wordlist


def ovveride_refs(wordlist):  # (sic) misspelled name kept — it is the public API
    """Reset the on-disk reference lists for every word in *wordlist*.

    Words are bucketed by first letter into 26 dicts (index ``ord(ch) - 97``;
    assumes every word starts with a lowercase ASCII letter — a word outside
    'a'..'z' may land in the wrong bucket or raise IndexError, unchanged from
    the original).  Each bucket maps word -> empty reference list and is
    dumped to "<json_folder_path>/<letter>_words.json".

    Returns:
        True on success; False if any file write fails (earlier buckets may
        already have been overwritten at that point).
    """
    buckets = [{} for _ in range(26)]
    for word in wordlist:
        buckets[ord(word[0]) - 97][word] = []

    for offset, bucket in enumerate(buckets):
        letter = chr(ord('a') + offset)
        # `global_data` is a module-level global set up in the __main__ block.
        path = "{}/{}_words.json".format(global_data['json_folder_path'], letter)
        try:
            with open(path, "w+") as file:
                json.dump(bucket, file, indent=4)
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # are no longer swallowed.  Any write failure aborts with False.
            return False
    return True


if __name__ == "__main__":
    global_data = get_global_data()
    wordlist = get_wordlist()
    if ovveride_refs(wordlist):
        print("Overrode all references to all words")
    else:
        print("Unable to override references, data might be partially overrode!")