import csv
import ntpath

import pandas
from hanziconv import HanziConv

# Post is assumed to be a plain data holder with date/value/source/origin
# attributes defined elsewhere in this repo, and EXTRACT_POST_MESSAGE a
# module-level flag used by translate_data_to_post below.


def create_post_from(file_path):
    # Filename convention: jbtalks_<date>_<title>.csv
    tokens = file_path.split('/')[-1].split('_')
    p = Post()
    p.date = tokens[1]
    p.value = tokens[2].split('.')[0].replace(' ', '')
    p.source = 'jbtalks'
    return p
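# Usage sketch for create_post_from, assuming the jbtalks filename
# convention "jbtalks_<date>_<title>.csv" implied by the token indices
# above; the sample path is hypothetical.
def _demo_create_post_from():
    post = create_post_from('data/jbtalks_2017-05-01_some title.csv')
    print(post.date)   # 2017-05-01
    print(post.value)  # sometitle (extension dropped, spaces removed)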
def read_lowyat_title(file_path):
    # Renamed from read_title: this module defined two read_title functions,
    # so the second definition silently shadowed this one.
    # Filename convention: lowyat_<date>_<id>_<title>.csv
    file_name = file_path.split('/')[-1]
    if 'lowyat' not in file_name:
        return Post()
    p = Post()
    p.date = file_name.split('_')[1]
    p.value = file_name.split('_')[3].split('.')[0].lower()
    p.origin = file_name
    p.source = 'lowyat'
    return p
def read_carinet_title(file_path):
    # Renamed from read_title for the same reason as above.
    # Filename convention: carinet_<date>_<title>.csv
    file_name = file_path.split('/')[-1]
    if 'carinet' not in file_name:
        return Post()
    p = Post()
    p.date = file_name.split('_')[1]
    p.value = file_name.split('_')[2].split('.')[0]
    p.origin = file_name
    p.source = 'carinet'
    return p
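# The two title readers above assume underscore-delimited filenames; from
# the token indices, lowyat files look like "lowyat_<date>_<id>_<title>.csv"
# and carinet files like "carinet_<date>_<title>.csv". The paths below are
# hypothetical, for illustration only.
def _demo_read_titles():
    low = read_lowyat_title('data/lowyat_2017-05-01_123_Road Tax.csv')
    print(low.date, low.value)  # 2017-05-01 road tax
    car = read_carinet_title('data/carinet_2017-05-01_myvi.csv')
    print(car.date, car.value)  # 2017-05-01 myvi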
def parse_facebook_csv(file_path):
    result = []
    if not file_path.endswith('.csv'):
        return result
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            p = Post()
            p.origin = file_path
            p.date = row['status_published']
            p.value = (row['status_message'] + row['link_name']).lower()
            p.source = 'facebook'
            result.append(p)
    return result
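# A minimal check of the Facebook CSV layout parse_facebook_csv expects:
# a header with at least status_published, status_message and link_name
# columns. The row below is made-up sample data.
def _demo_parse_facebook_csv():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as f:
        f.write('status_published,status_message,link_name\n'
                '2017-05-01 10:00:00,Hello World,Example Link\n')
        path = f.name
    try:
        for post in parse_facebook_csv(path):
            print(post.date, post.value)  # 2017-05-01 10:00:00 hello worldexample link
    finally:
        os.remove(path)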
def translate_data_to_post(data_list, file_path):
    result = []
    if data_list is None:
        return result
    for data in data_list:
        if "message" in data and EXTRACT_POST_MESSAGE:
            p = Post()
            p.date = data["created_time"]
            p.value = data["message"].lower()
            # Source label: first two "__"-separated tokens of the file name.
            p.source = '_'.join(ntpath.basename(file_path).split("__")[:2])
            p.origin = file_path
            result.append(p)
        # Comments share the same structure, so recurse into them.
        if "comments" in data and len(data["comments"]) > 0:
            result += translate_data_to_post(data["comments"], file_path)
    return result
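# A sketch of the nested structure translate_data_to_post walks: a list of
# post dicts, each optionally carrying a "comments" list of the same shape.
# The data and file name below are made up; with the hypothetical name
# "facebook__page__dump.json" the derived source label is "facebook_page",
# and output also depends on the EXTRACT_POST_MESSAGE flag being truthy.
def _demo_translate_data_to_post():
    data = [{
        'created_time': '2017-05-01T10:00:00',
        'message': 'Top-level post',
        'comments': [{'created_time': '2017-05-01T11:00:00',
                      'message': 'A reply'}],
    }]
    for post in translate_data_to_post(data, 'facebook__page__dump.json'):
        print(post.source, post.date, post.value)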
def parse_jbtalks(file_name):
    result = []
    result.append(create_post_from(file_name))
    date = result[0].date  # every row inherits the date from the file name
    with open(file_name, 'r', encoding='utf8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            p = Post()
            p.origin = file_name
            p.date = date
            # Normalise Traditional Chinese to Simplified.
            p.value = HanziConv.toSimplified(row['text'].replace("\n", "").strip())
            p.source = 'jbtalks'
            result.append(p)
    return result
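# HanziConv.toSimplified converts Traditional Chinese to Simplified, so the
# jbtalks and carinet corpora end up in a single script; a quick check:
def _demo_hanziconv():
    print(HanziConv.toSimplified('繁體中文'))  # 繁体中文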
def parse_blog(file_path):
    result = []
    with open(file_path, 'r', errors='ignore') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            date = row['column1']
            for key, value in row.items():
                # column1 carries the date and column2 is skipped; every
                # other non-empty cell becomes its own Post.
                if key in ['column1', 'column2'] or value == '':
                    continue
                p = Post()
                p.date = date
                p.origin = file_path
                p.value = value.lower()
                p.source = 'blog'
                result.append(p)
    return result
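# A sketch of the blog CSV layout parse_blog implies: column1 holds the
# date, column2 is ignored, and every other non-empty cell becomes a Post.
# The header and row below are made up for illustration.
def _demo_parse_blog():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as f:
        f.write('column1,column2,column3,column4\n'
                '2017-05-01,skipped,First Snippet,Second Snippet\n')
        path = f.name
    try:
        for post in parse_blog(path):
            print(post.date, post.value)  # two Posts, one per snippet cell
    finally:
        os.remove(path)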
def parse_carinet(file_path):
    result = []
    result.append(read_carinet_title(file_path))
    with open(file_path, 'r', errors='ignore') as csvfile:
        # skiprows=[0] skips the title row already handled above;
        # on_bad_lines='skip' (pandas >= 1.3) replaces the removed
        # error_bad_lines=False flag used here previously.
        df = pandas.read_csv(csvfile, skiprows=[0], on_bad_lines='skip')
        for index, row in df.iterrows():
            if not isinstance(row['text'], str):
                continue
            p = Post()
            p.origin = file_path
            p.date = str(row['date'])
            p.value = HanziConv.toSimplified(row['text'].replace("\n", "").strip())
            p.source = 'carinet'
            result.append(p)
    return result
def parse_lowyat(file_path):
    # Index files of retrieved links carry no post content.
    if "links_retrieved" in file_path:
        return [Post()]
    result = []
    result.append(read_lowyat_title(file_path))
    with open(file_path, 'r', errors='ignore') as csvfile:
        df = pandas.read_csv(csvfile, skiprows=[0])
        for index, row in df.iterrows():
            if not isinstance(row['text'], str):
                continue
            p = Post()
            p.date = str(row['date'])
            p.value = row['text'].lower()
            p.origin = file_path
            p.source = 'lowyat'
            result.append(p)
    return result
def parse_twitter(file_path):
    with open(file_path, 'r', errors='ignore') as file:
        first_line = file.readline()
        if '< HEAD' not in first_line:
            # Not a leftover git conflict marker: rewind so pandas sees
            # the header line again.
            file.seek(0)
        result = []
        df = pandas.read_csv(file)
        for index, row in df.iterrows():
            if not isinstance(row['text'], str):
                continue
            p = Post()
            p.date = str(row['created_at'])
            p.value = row['text'].lower()
            p.source = 'twitter'
            p.origin = file_path
            result.append(p)
    return result
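# A hedged sketch of how these parsers might be dispatched by file name;
# the routing keywords mirror the source tags above, but this helper is an
# illustration and not part of the original module.
def parse_any(file_path):
    name = file_path.split('/')[-1]
    if name.endswith('.json'):
        import json
        with open(file_path, 'r', errors='ignore') as f:
            # translate_data_to_post expects already-decoded JSON.
            return translate_data_to_post(json.load(f), file_path)
    for keyword, parser in (('jbtalks', parse_jbtalks),
                            ('lowyat', parse_lowyat),
                            ('carinet', parse_carinet),
                            ('facebook', parse_facebook_csv),
                            ('twitter', parse_twitter),
                            ('blog', parse_blog)):
        if keyword in name:
            return parser(file_path)
    return []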