Example #1
def create_post_from(file_path):
    # Build a Post from the jbtalks file name itself (token 1 = date, token 2 = title).
    p = Post()
    tokens = file_path.split('/')[-1].split('_')
    p.date = tokens[1]
    p.value = tokens[2].split('.')[0].replace(' ', '')
    p.source = 'jbtalks'
    return p
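Every example here leans on a Post container, a few imports and one module-level flag that the snippets do not show. The following is a minimal sketch of what they assume; the attribute and constant names are taken from the snippets, everything else is filled in as an assumption.

import csv
import ntpath

import pandas
from hanziconv import HanziConv

EXTRACT_POST_MESSAGE = True  # switch referenced in Example #5; the real value is assumed


class Post:
    # Plain data holder; the real class may differ, but these are the
    # attributes the parsers assign.
    def __init__(self):
        self.date = None    # publish date, kept as a string
        self.value = None   # post or comment text
        self.source = None  # site the post came from, e.g. 'jbtalks'
        self.origin = None  # path of the file the post was read from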
Example #2
def read_title(file_path):
    # Build a Post from the lowyat file name itself (token 1 = date, token 3 = thread title).
    file_path = file_path.split('/')[-1]
    if 'lowyat' not in file_path:
        return Post()
    p = Post()
    p.date = file_path.split('_')[1]
    p.value = file_path.split('_')[3].split('.')[0].lower()
    p.origin = file_path
    p.source = 'lowyat'
    return p
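For illustration, assuming a lowyat file name of the form <site>_<date>_<id>_<title>.csv (the naming scheme itself is an assumption), read_title picks out tokens 1 and 3:

# Hypothetical file name and output values.
p = read_title('data/lowyat_2019-03-01_12345_Some Thread Title.csv')
print(p.date)    # '2019-03-01'
print(p.value)   # 'some thread title'
print(p.source)  # 'lowyat'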
Example #3
def read_title(file_path):
    # Build a Post from the carinet file name itself (token 1 = date, token 2 = thread title).
    file_path = file_path.split('/')[-1]
    if 'carinet' not in file_path:
        return Post()
    p = Post()
    p.date = file_path.split('_')[1]
    p.value = file_path.split('_')[2].split('.')[0]
    p.origin = file_path
    p.source = 'carinet'
    return p
Example #4
def parse_facebook_csv(file_path):
    # Read a Facebook page export (CSV) and turn every row into a Post.
    result = []
    if not file_path.endswith('.csv'):
        return result
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            p = Post()
            p.origin = file_path
            p.date = row['status_published']
            p.value = (row['status_message'] + row['link_name']).lower()
            p.source = 'facebook'
            result.append(p)
    return result
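parse_facebook_csv expects at least the status_published, status_message and link_name columns; the column names come from the code above, while the sample data and path below are made up:

# facebook_page.csv (hypothetical contents):
#   status_published,status_message,link_name
#   2019-03-01 10:00,New promo this week,Promo details
posts = parse_facebook_csv('data/facebook_page.csv')
for post in posts:
    # Note that status_message and link_name are concatenated with no separator.
    print(post.date, post.value)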
Example #5
def translate_data_to_post(data_list, file_path):
    # Convert a list of post dictionaries, and recursively their comments, into Posts.
    result = []
    if data_list is None:
        return result
    for data in data_list:
        if "message" in data and EXTRACT_POST_MESSAGE:
            p = Post()
            p.date = data["created_time"]
            p.value = data["message"].lower()
            p.source = '_'.join(ntpath.basename(file_path).split("__")[:2])
            p.origin = file_path
            result.append(p)
        if "comments" in data and len(data["comments"]) > 0:
            result += translate_data_to_post(data["comments"], file_path)
    return result
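The input here looks like a Graph-API-style dump: a list of post dictionaries whose comments key holds more dictionaries of the same shape. A hypothetical example, assuming EXTRACT_POST_MESSAGE is True in the parser module:

# Made-up data; field names mirror the code above.
data = [
    {
        "created_time": "2019-03-01T10:00:00",
        "message": "Original post text",
        "comments": [
            {"created_time": "2019-03-01T11:00:00", "message": "A reply"},
        ],
    },
]
posts = translate_data_to_post(data, 'data/brand__page__posts.json')
print(len(posts))       # 2 -- the post plus its comment
print(posts[0].source)  # 'brand_page' (first two '__'-separated parts of the file name)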
Example #6
def parse_jbtalks(file_name):
    # The thread title (parsed from the file name) becomes the first Post;
    # every CSV row becomes a further Post carrying the same date.
    result = []
    result.append(create_post_from(file_name))
    date = result[0].date
    with open(file_name, 'r', encoding='utf8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            p = Post()
            p.origin = file_name
            p.date = date
            p.value = HanziConv.toSimplified(row['text'].replace("\n", "").strip())
            p.source = 'jbtalks'
            result.append(p)
    return result
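HanziConv comes from the hanziconv package; toSimplified maps Traditional Chinese characters to their Simplified forms, which keeps posts from different forums in one script. A quick illustration:

from hanziconv import HanziConv

print(HanziConv.toSimplified('繁體中文'))  # -> '繁体中文'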
Example #7
def parse_lowyat(file_path):
    # Index files ("links_retrieved") carry no post text; return a single empty Post.
    if "links_retrieved" in file_path:
        return [Post()]
    result = []
    result.append(read_title(file_path))
    with open(file_path, 'r', errors='ignore') as csvfile:
        df = pandas.read_csv(csvfile, skiprows=[0])
        for index, row in df.iterrows():
            if not isinstance(row['text'], str):
                continue
            p = Post()
            p.date = str(row['date'])
            p.value = row['text'].lower()
            p.origin = file_path
            p.source = 'lowyat'
            result.append(p)
    return result
Example #8
def parse_blog(file_path):
    # column1 holds the date, column2 is skipped, and every other
    # non-empty cell in a row becomes its own Post.
    result = []
    with open(file_path, 'r', errors='ignore') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            date = row['column1']
            for key, value in row.items():
                if key in ['column1', 'column2'] or value == '':
                    continue
                p = Post()
                p.date = date
                p.origin = file_path
                p.value = value.lower()
                p.source = 'blog'
                result.append(p)
    return result
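The blog export is assumed to be a wide CSV where column1 carries the date, column2 is metadata to skip, and the remaining columns carry text fragments. For illustration:

# blog_export.csv (hypothetical contents):
#   column1,column2,column3,column4
#   2019-03-01,ignored,First paragraph of the post,Second paragraph
#
# The row above produces two Post objects, both dated 2019-03-01.
posts = parse_blog('data/blog_export.csv')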
Example #9
def parse_carinet(file_path):
    # The thread title (parsed from the file name) becomes the first Post;
    # every readable CSV row becomes a further Post.
    result = []
    result.append(read_title(file_path))
    with open(file_path, 'r', errors='ignore') as csvfile:
        df = pandas.read_csv(csvfile, skiprows=[0], error_bad_lines=False)
        for index, row in df.iterrows():
            if not isinstance(row['text'], str):
                continue
            p = Post()
            p.origin = file_path
            p.date = str(row['date'])
            p.value = HanziConv.toSimplified(row['text'].replace("\n", "").strip())
            p.source = 'carinet'
            result.append(p)
    return result
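error_bad_lines=False silently skips rows pandas cannot parse; the argument was deprecated in pandas 1.3 and removed in 2.0, so on a recent pandas the same call would read as follows (adjust to whichever pandas version the project pins):

# pandas >= 1.3 spelling of "skip rows that cannot be parsed"
df = pandas.read_csv(csvfile, skiprows=[0], on_bad_lines='skip')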
Example #10
def parse_twitter(file_path):
    result = []
    with open(file_path, 'r', errors='ignore') as file:
        first_line = file.readline()
        if '< HEAD' not in first_line:
            # The first line is only skipped when it contains a '< HEAD' marker;
            # otherwise rewind so pandas still sees it.
            file.seek(0)
        df = pandas.read_csv(file)
        for index, row in df.iterrows():
            if not isinstance(row['text'], str):
                continue
            p = Post()
            p.date = str(row['created_at'])
            p.value = row['text'].lower()
            p.source = 'twitter'
            p.origin = file_path
            result.append(p)
    return result
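None of the examples show how a parser is chosen per file. A minimal dispatch sketch, assuming the site name appears somewhere in the file name (the routing rules here are guesses, not the original logic):

import os


def parse_file(file_path):
    # Route a raw file to the matching parser above (sketch only).
    name = os.path.basename(file_path).lower()
    if 'facebook' in name:
        return parse_facebook_csv(file_path)
    if 'lowyat' in name:
        return parse_lowyat(file_path)
    if 'carinet' in name:
        return parse_carinet(file_path)
    if 'jbtalks' in name:
        return parse_jbtalks(file_path)
    if 'twitter' in name:
        return parse_twitter(file_path)
    if 'blog' in name:
        return parse_blog(file_path)
    return []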