def read_data(fp):
    """Send the money amounts found in each qualifying article to the database.

    ``fp`` is passed unchanged to ``splitLexisNexis_AG``, whose result is
    iterated as a sequence of article strings (each article is used with
    ``in`` and ``.lower()``).  An article qualifies when ``dict_sum`` is
    positive for both the inflow and the agriculture dictionaries and the
    money pattern finds at least one amount in the lower-cased text.  For
    every state named in a qualifying article, ``clean_send_to_DB`` is called
    with the state and the list of matched amount strings.  States matched by
    their full name are translated through ``state_map`` first (an unknown
    name raises whatever ``state_map[...]`` raises); short names are sent
    as-is.  Returns ``None``.

    NOTE(review): the original source had lost its indentation.  This
    version assumes one record per state mentioned in the article, rather
    than only the last matching dictionary entry -- confirm against the
    intended behaviour.
    NOTE(review): assumes ``read_csvDictionary`` returns a re-iterable
    collection of strings -- confirm.
    """
    articles = splitLexisNexis_AG(fp)
    full_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    short_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states_shortname.csv')
    inflow_terms = read_csvDictionary('/home/team3/Data/Dictionaries/inflow.csv')
    agriculture_terms = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')

    # Compiled once.  The original wrote "[0.9]" where "[0-9]" was meant and
    # left the decimal point unescaped, so decimal amounts never matched; its
    # final alternative was a duplicate of "million dollar".
    money_pattern = re.compile('|'.join([
        r'\$ [0-9]+,[0-9]+,[0-9]+',
        r'\$ [0-9]+,[0-9]+',
        r'\$ [0-9]+ million',
        r'\$ [0-9]+ billion',
        r'\$ ?[0-9]+\.[0-9]+ million',
        r'\$ ?[0-9]+\.[0-9]+ billion',
        r'[0-9]+\.[0-9]+ million dollar',
        r'[0-9]+\.[0-9]+ billion dollar',
    ]))

    for article in articles:
        # Blank dictionary entries are skipped: an empty string is "found" in
        # every article and does not name a state.
        named_in_full = [s for s in full_names if s.strip() and s in article]
        named_in_short = [s for s in short_names if s.strip() and s in article]
        if not named_in_full and not named_in_short:
            continue

        # These checks depend only on the article, so run them once instead
        # of once per dictionary pass.
        if not (dict_sum(inflow_terms, article) > 0
                and dict_sum(agriculture_terms, article) > 0):
            continue
        amounts = money_pattern.findall(article.lower())
        if not amounts:
            continue

        # Each call gets its own copy in case clean_send_to_DB mutates it.
        for state in named_in_full:
            clean_send_to_DB(state_map[state], list(amounts))
        for state in named_in_short:
            clean_send_to_DB(state, list(amounts))
def read_data(fp):
    """Send (state, project, money amounts) records for each article.

    ``fp`` is passed unchanged to ``splitLexisNexis_AG``, whose result is
    iterated as a sequence of article strings.  Project names come from
    ``Project List.csv`` with trailing whitespace stripped.  For every state
    and every project named in an article, if the money pattern finds at
    least one amount in the lower-cased text, ``clean_send_to_DB`` is called
    with the state, the project name and the list of matched amount strings.
    States matched by full name are translated through ``state_map`` first
    (an unknown name raises whatever ``state_map[...]`` raises); short names
    are sent as-is.  Returns ``None``.

    Both passes now share one money pattern.  The original used two
    different, partly broken patterns for the same article text, so the
    amounts recorded depended on how the state happened to be spelled.

    NOTE(review): the original source had lost its indentation.  This
    version assumes the money lookup and the database call happen once per
    (state, project) pair found in the article -- confirm.
    NOTE(review): assumes ``read_csvDictionary`` returns a re-iterable
    collection of strings -- confirm.
    """
    articles = splitLexisNexis_AG(fp)
    full_names = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    short_names = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states_shortname.csv')
    raw_projects = read_csvDictionary(
        '/home/team3/Data/Dictionaries/Project List.csv')
    # Blank project rows are dropped: an empty string is "found" in every
    # article.
    projects = [name.rstrip() for name in raw_projects if name.strip()]

    # Regular expression for finding money.  Fixes over the original:
    # "[0.9]" -> "[0-9]", the decimal point is escaped, the missing "|"
    # after "million-dollar" is restored, and the duplicated final
    # alternative now covers "billion dollar".
    money_pattern = re.compile('|'.join([
        r'\$ ?[0-9]+,[0-9]+,[0-9]+',
        r'\$ ?[0-9]+,[0-9]+',
        r'\$ ?[0-9]+ million',
        r'\$ ?[0-9]+ billion',
        r'million dollar',
        r'billion dollar',
        r'million-dollar',
        r'billion-dollar',
        r'\$ ?[0-9]+\.[0-9]+ million',
        r'\$ ?[0-9]+\.[0-9]+ billion',
        r'[0-9]+\.[0-9]+ million dollar',
        r'[0-9]+\.[0-9]+ billion dollar',
    ]))

    for article in articles:
        named_projects = [name for name in projects if name in article]
        if not named_projects:
            continue
        amounts = money_pattern.findall(article.lower())
        if not amounts:
            continue

        # Blank state entries are skipped for the same reason as blank
        # project rows.
        named_in_full = [s for s in full_names if s.strip() and s in article]
        named_in_short = [s for s in short_names if s.strip() and s in article]

        # Each call gets its own copy in case clean_send_to_DB mutates it.
        for state in named_in_full:
            for project_name in named_projects:
                clean_send_to_DB(state_map[state], project_name, list(amounts))
        for state in named_in_short:
            for project_name in named_projects:
                clean_send_to_DB(state, project_name, list(amounts))
def read_data(fp):
    """Insert the crop terms found in agriculture articles, per state.

    ``fp`` is passed unchanged to ``splitLexisNexis_AG``, whose result is
    iterated as a sequence of article strings.  For every article that
    mentions at least one agriculture term, each state named by its full
    name is mapped to its short name (same position in the short-name
    dictionary) and associated with the list of crop terms found in that
    article.  A later article mentioning the same state replaces the earlier
    list, as in the original.  One row ``(state, crop)`` per crop occurrence
    is then inserted into ``ftm.cropType``.  Returns ``None``.

    Database failures are reported on stdout and otherwise ignored; the
    connection is always closed.

    NOTE(review): the original also collected matches on state short names
    into a second dict that was never read; that dead pass is dropped here.
    If short-name matches were meant to be stored too, that still needs
    adding.
    NOTE(review): assumes ``read_csvDictionary`` returns lists of strings
    and that the two state lists are index-aligned -- confirm.
    """
    articles = splitLexisNexis_AG(fp)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    states_short = read_csvDictionary('/home/team3/Data/Dictionaries/us_states_shortname.csv')
    agriculture = read_csvDictionary('/home/team3/Data/Dictionaries/agriculture.csv')
    crops = read_csvDictionary('/home/team3/Data/Dictionaries/crops.csv')

    def term_pattern(terms):
        # Lookahead with a capture group reports a term at every position
        # where one starts, including overlapping occurrences.  Terms are
        # escaped so punctuation in a dictionary entry is matched literally;
        # empty entries are dropped because they would match everywhere.
        escaped = [re.escape(term) for term in terms if term]
        if not escaped:
            return re.compile(r"(?!)")  # never matches
        return re.compile(r"(?=(" + '|'.join(escaped) + r"))")

    state_re = term_pattern(states)
    agriculture_re = term_pattern(agriculture)
    crop_re = term_pattern(crops)

    # First occurrence wins for duplicated names, like list.index did.
    short_name_of = {}
    for full_name, short_name in zip(states, states_short):
        short_name_of.setdefault(full_name, short_name)

    crops_by_state = {}
    for article in articles:
        mentioned = state_re.findall(article)
        if not mentioned or agriculture_re.search(article) is None:
            continue
        found_crops = crop_re.findall(article)
        for full_name in mentioned:
            if full_name in short_name_of:
                crops_by_state[short_name_of[full_name]] = found_crops

    rows = [
        (state, crop)
        for state, found in crops_by_state.items()
        for crop in found
    ]

    # Inserting data into database
    query = "INSERT into ftm.cropType values(%s,%s)"
    conn = None
    try:
        conn = psycopg2.connect("dbname='team3' user='******' host='localhost' password='******'")
        cur = conn.cursor()
        for row in rows:
            cur.execute(query, row)
        conn.commit()
    except psycopg2.Error as exc:
        # Best effort, as before, but say what went wrong.
        print("error", exc)
    finally:
        # Closing without a commit discards any partially inserted rows.
        if conn is not None:
            conn.close()
def read_data(fp):
    """Return corruption-term counts per state for agriculture articles.

    ``fp`` is passed unchanged to ``splitLexisNexis_AG``, whose result is
    iterated as a sequence of article strings.  An article counts when it
    mentions at least one agriculture term and at least one corruption term.
    For such an article, every state named in it is assigned the number of
    corruption-term occurrences in that article; a later article mentioning
    the same state replaces the earlier value, as in the original.  States
    matched by full name are stored under their short name (same position
    in the short-name dictionary).

    Returns a ``Counter`` that adds the values collected through full names
    to those collected through short names.

    The original declared ``global cor_count`` / ``global cor_count_short``
    after assigning those names locally, which Python rejects as a
    ``SyntaxError``; both are plain locals here.

    NOTE(review): because values are overwritten rather than summed, the
    result reflects only the last qualifying article per state -- confirm
    that this is intended.
    NOTE(review): assumes ``read_csvDictionary`` returns lists of strings
    and that the two state lists are index-aligned -- confirm.
    """
    articles = splitLexisNexis_AG(fp)
    states = read_csvDictionary('/home/team3/Data/Dictionaries/us_states1.csv')
    states_short = read_csvDictionary(
        '/home/team3/Data/Dictionaries/us_states_shortname.csv')
    corruption = read_csvDictionary(
        '/home/team3/Data/Dictionaries/corruption1.csv')
    agriculture = read_csvDictionary(
        '/home/team3/Data/Dictionaries/agriculture.csv')

    def term_pattern(terms):
        # Lookahead with a capture group reports a term at every position
        # where one starts, including overlapping occurrences.  Terms are
        # escaped so punctuation in a dictionary entry is matched literally;
        # empty entries are dropped because they would match everywhere.
        escaped = [re.escape(term) for term in terms if term]
        if not escaped:
            return re.compile(r"(?!)")  # never matches
        return re.compile(r"(?=(" + '|'.join(escaped) + r"))")

    state_re = term_pattern(states)
    state_short_re = term_pattern(states_short)
    agriculture_re = term_pattern(agriculture)
    corruption_re = term_pattern(corruption)

    # First occurrence wins for duplicated names, like list.index did.
    short_name_of = {}
    for full_name, short_name in zip(states, states_short):
        short_name_of.setdefault(full_name, short_name)

    counts_via_full = {}
    counts_via_short = {}
    for article in articles:
        # Both checks depend only on the article, so do them once up front.
        if agriculture_re.search(article) is None:
            continue
        corruption_hits = len(corruption_re.findall(article))
        if corruption_hits == 0:
            continue

        # For full forms, stored under the matching short name.
        for full_name in state_re.findall(article):
            if full_name in short_name_of:
                counts_via_full[short_name_of[full_name]] = corruption_hits

        # For short forms.
        for short_name in state_short_re.findall(article):
            counts_via_short[short_name] = corruption_hits

    # To merge both dictionary values
    return Counter(counts_via_full) + Counter(counts_via_short)