Example 1
def insert_terms(db_path=None, db_url=None, input_path=None):
    assert (db_path is None and db_url is not None) or (db_path is not None
                                                        and db_url is None)
    assert input_path is not None

    if db_path:
        engine = get_engine(db_path=db_path)
    else:
        engine = get_engine(db_url=db_url)
    # session = get_session(engine)
    con = engine.connect()

    meta = MetaData(engine)
    term_table = Table('terms', meta, autoload=True)
    with open(input_path, mode='r', encoding='utf-8') as file:
        insert_list = []
        for line in file:
            line_split = line.strip().split('\t')
            term1 = line_split[0]
            if len(line_split) > 1:
                term2 = line_split[-1]
            else:
                term2 = None
            insert_list.append({'term': term1, 'term2': term2})

    con.execute(term_table.insert(), insert_list)
    con.close()
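A minimal usage sketch (the paths are hypothetical; the input format implied by the parsing above is one term per line, with an optional tab-separated second term):

# Hypothetical invocation: exactly one of db_path / db_url may be given.
insert_terms(db_path='data/wos.db', input_path='data/terms.tsv')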
Example 2
def get_split_title_keyword_abstract(db_path=None,
                                     db_url=None,
                                     output_path='',
                                     foreground=False):
    assert (db_path is None and db_url is not None) or (db_path is not None
                                                        and db_url is None)
    assert output_path is not None

    if db_path:
        engine = get_engine(db_path=db_path)
    else:
        engine = get_engine(db_url=db_url)
    session = get_session(engine)

    data = session.query(WosDocument).all()
    path = r'C:/Users/Tom/Desktop/bio_nature'
    if foreground:
        inner_path = path + '/foreground'
    else:
        inner_path = path + '/background'

    for document in data:
        title = document.title.strip() + '.'

        kw_str = ''
        # kw_str = ', '.join(document.keywords)
        for kw in document.keywords:
            kw_str += kw.keyword + '. '
        # kw_str = kw_str[:-2]

        kp_str = ''
        # kp_str = ', '.join(document.keyword_plus)
        for kp in document.keyword_plus:
            kp_str += kp.keyword_plus + '. '
        # kp_str = kp_str[:-2]

        if document.abs:
            abs_str = document.abs.replace('. ', '.\n')
        else:
            abs_str = ''
        out_str = '\n'.join([title, kw_str, kp_str, abs_str])
        filename = inner_path + '/{}-{}.txt'.format(document.unique_id,
                                                    document.pub_year)
        with open(filename, mode='w', encoding='utf-8') as file:
            file.write(out_str)

        with open(path +
                  ('/foreground.list' if foreground else '/background.list'),
                  mode='a',
                  encoding='utf-8') as l:
            l.write(
                ('foreground' if foreground else 'background') +
                '/{}-{}.txt\n'.format(document.unique_id, document.pub_year))
Example 3
def parse(input_dir=None, db_path=None, db_url=None):
    assert input_dir is not None and (db_path is not None
                                      or db_url is not None)

    init_set = set()

    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file[-4:] == '.txt':
                exist_set = parse_single(os.path.join(root, file), db_path,
                                         db_url, init_set)
                init_set = init_set.union(exist_set)

    # Finally, process the internal citation relations
    print('Starting to process internal citation relations...')
    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    session.execute(
        'INSERT INTO wos_inner_reference '
        'SELECT DISTINCT t1.document_unique_id AS citing_paper_id, t2.unique_id AS cited_paper_id '
        'FROM wos_reference t1 INNER JOIN wos_document t2 '
        'ON t1.document_md5 = t2.document_md5 OR t1.doi = t2.doi '
        'ORDER BY citing_paper_id, cited_paper_id')
    session.commit()
    session.execute(
        'DELETE FROM wos_inner_reference WHERE citing_paper_id = cited_paper_id'
    )
    session.commit()
    session.close()

    print('All parsing finished')
Example 4
def draw_cooccurrence_network(net_type=None,
                              db_path=None,
                              output_path=None,
                              top_n=30):
    assert net_type is not None and output_path is not None and db_path is not None

    engine = get_engine(db_path)
    session = get_session(engine)

    print('Processing co-occurrence data')
    graph_data = []
    data = []
    title = None
    if net_type == 'keyword':
        title = 'Author Keyword Co-occurrence Network'
        data = session.query(WosDocument.unique_id,
                             func.group_concat(WosKeyword.keyword, ';'))\
                                .join(WosKeyword).group_by(WosDocument.unique_id)
        filter_data = session.query(WosKeyword.keyword, func.count('*').label('num')) \
            .group_by(WosKeyword.keyword).order_by(desc('num'))
    elif net_type == 'keyword_plus':
        title = 'WoS Keyword Co-occurrence Network'
        data = session.query(WosDocument.unique_id,
                             func.group_concat(WosKeywordPlus.keyword_plus, ';'))\
                                .join(WosKeywordPlus).group_by(WosDocument.unique_id)
        filter_data = session.query(WosKeywordPlus.keyword_plus, func.count('*').label('num')) \
            .group_by(WosKeywordPlus.keyword_plus).order_by(desc('num'))
    elif net_type == 'author':
        title = 'Author Co-authorship Network'
        data = session.query(WosDocument.unique_id,
                             func.group_concat(WosAuthor.last_name +','+ WosAuthor.first_name, ';'))\
                                .join(WosAuthor).group_by(WosDocument.unique_id)
        filter_data = session.query(WosAuthor.last_name + ',' + WosAuthor.first_name, func.count('*').label('num')) \
            .group_by(WosAuthor.last_name + ',' + WosAuthor.first_name).order_by(desc('num'))

    else:
        print('Unhandled network type:', net_type)
        exit(-1)

    for row in data:
        row_split = row[1].split(';')
        if len(row_split) > 1:
            graph_data += list(combinations(row_split, 2))

    # network is the co-occurrence network over all keywords
    print('Generating co-occurrence network')
    network = get_network(graph_data, directed=False)

    session.close()

    nx.write_graphml(network, 'test.graphml')

    filter_nodes = [i[0] for i in filter_data[top_n:]]
    sub = nx.restricted_view(network, filter_nodes, [])

    # Largest connected component
    # sub = sorted(nx.connected_component_subgraphs(sub), key = len, reverse=True)[0]

    # print('Drawing network')
    draw_net(sub, title=title, output_path=os.path.join(output_path, net_type))
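get_network is not shown on this page; a plausible sketch of what it builds, assuming networkx and that repeated pairs accumulate edge weight:

import networkx as nx

def get_network_sketch(pair_list, directed=False):
    # Hypothetical stand-in for get_network: one node per term/author,
    # one weighted edge per co-occurring pair.
    graph = nx.DiGraph() if directed else nx.Graph()
    for a, b in pair_list:
        if graph.has_edge(a, b):
            graph[a][b]['weight'] += 1
        else:
            graph.add_edge(a, b, weight=1)
    return graph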
Example 5
def save_transfer(transfer: Dict):
    transfer_id = None

    try:
        with Session(model.get_engine()) as session:
            with session.begin():
                operation_id = transfer['operation_id']

                operation = session.query(model.Operation).get(operation_id)

                operation.state = transfer['next_state']

                transfer = model.Transfer(tran_id=transfer['tranId'],
                                          type=transfer['type'],
                                          asset=transfer['asset'],
                                          amount=transfer['amount'],
                                          state='FILLED')

                operation.transfer = transfer

            transfer_id = transfer.id

    except Exception as ex:
        print(f"Error al guardar Transferencia = {transfer}")
        print(ex)
        traceback.print_stack()

    return transfer_id
Example 6
def save_future_sell(future_order_dict: Dict):
    future_sell_id = None

    try:
        with Session(model.get_engine()) as session:
            with session.begin():
                operation_id = future_order_dict['operation_id']

                operation = session.query(model.Operation).get(operation_id)

                operation.state = 'FUTURE_SELL'

                order_id = future_order_dict['orderId']

                future_order = session.query(model.FutureOrder).filter_by(order_id=order_id).first()

                future_sell = model_helper.sync_future_order(future_order_dict, future_order)

                operation.future_order = future_sell

            future_sell_id = future_sell.id

    except Exception as ex:
        print(f"Error al guardar Venta de futuro = {future_order_dict}")
        print(ex)
        traceback.print_stack()

    return future_sell_id
Example 7
def save_operation_buy_spot(position_dict, spot_order_dict):
    with Session(model.get_engine()) as session:
        with session.begin():
            # spot_order
            order_id = spot_order_dict['orderId']

            spot_order = session.query(
                model.SpotOrder).filter_by(order_id=order_id).first()
            spot_order = model_helper.sync_spot_order(spot_order_dict,
                                                      spot_order)

            # operation
            operation = spot_order.operation

            # position
            position = operation.position if operation else None

            if not position:
                position_id = position_dict['position_id']

                if position_id:
                    position = session.query(model.Position).get(position_id)
                else:
                    position = model.Position()
                    session.add(position)

            position = model_helper.sync_position(
                {
                    **position_dict, 'state': 'CREATED'
                }, position)

            if not operation:
                operation = model.Operation()

                position.operations.append(operation)

                operation.spot_order = spot_order

            operation = model_helper.sync_operation(
                {
                    **position_dict, 'kind': 'OPEN',
                    'state': 'SPOT_BUY'
                }, operation)
Example 8
def save_spot_sell(spot_order: Dict):
    spot_sell_id = None

    with Session(model.get_engine()) as session:
        with session.begin():
            operation_id = spot_order['operation_id']

            operation = session.query(model.Operation).get(operation_id)

            operation.state = 'SPOT_SELL'

            operation.position.state = 'CLOSING'

            # spot_sell = model_helper.sync_spot_order(spot_order)

            # operation.spot_order = spot_sell

        # spot_sell_id = spot_sell.id

    return spot_sell_id
Example 9
def task_avg_ratio(tickers, field, quantity):
    engine = model.get_engine()
    engine.dispose()

    sleep_time = quantity / 10 * 60 / len(tickers)

    for ticker in tickers:
        avg = model_service.get_data_ratio(engine, ticker=ticker, quantity=quantity)
        model_service.save_avg_ratio(engine, ticker, field, avg)

    while app.running:
        for ticker in tickers:
            try:
                avg = model_service.get_data_ratio(engine, ticker=ticker, quantity=quantity)
                model_service.save_avg_ratio(engine, ticker, field, avg)
            except Exception as ex:
                print(field)
                print(ex)

            time.sleep(sleep_time)
Example 10
def save_future_buy(operation_dict: Dict, future_order_dict: Dict):
    future_buy_id = None

    with Session(model.get_engine()) as session:
        with session.begin():
            position_id = operation_dict['position_id']

            position = session.query(model.Position).get(position_id)

            position.state = 'CLOSING'
            position.message = ''

            operation = model_helper.sync_operation({
                **operation_dict, 'kind': 'CLOSE',
                'state': 'FUTURE_BUY'
            })

            position.operations.append(operation)

            order_id = future_order_dict['orderId']

            future_order = session.query(
                model.FutureOrder).filter_by(order_id=order_id).first()

            if future_order:
                print(
                    f"Encontre future order = {future_order.id}, {future_order.status}"
                )

            future_buy = model_helper.sync_future_order(
                future_order_dict, future_order)

            operation.future_order = future_buy

            # future_object = session.query(model.Future).get(operation.future)
            # future_object.balance.outdated = True

        future_buy_id = future_buy.id

    return future_buy_id
Example 11
def parse_single(input_file=None, db_path=None, db_url=None, exist_set=None):
    assert input_file is not None and (db_path is not None or db_url
                                       is not None) and exist_set is not None

    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    volume_pattern = re.compile(r'^v\d+$')
    page_pattern = re.compile(r'^p\w*\d+$')
    doi_pattern = re.compile(r'^doi \d+.+$')
    year_pattern = re.compile(r'^\d{4}$')

    print('Parsing {}...'.format(input_file))

    with open(input_file, 'r', encoding='utf-8') as file:
        cur_field = None
        author_dict = {}
        initials_list = []
        author_order = 1
        wos_document = WosDocument()
        wos_document_list = []

        # Fields that wrap across multiple lines are buffered first and processed at the end
        journal_line = None
        wos_category_line = None
        research_area_line = None
        keyword_line = None
        keyword_plus_line = None
        funding_line = None

        for line in file.readlines():
            line = line[:-1].lower()

            # Identify the field by the first three characters of each line
            tmp = line[:3]
            if tmp != '   ':
                cur_field = tmp

            if cur_field == 'pt ':
                del wos_document
                wos_document = WosDocument()
            # Collect the authors' abbreviated names
            elif cur_field == 'au ':
                if tmp == cur_field:
                    initials_list.clear()
                full_name = line[3:]
                initials_list.append(full_name)
            # Parse authors to obtain the author order; reprint author and
            # addresses are not bound yet.
            # Special cases to handle: names split by spaces, anonymous
            # authors with no separator, and group authors in the CA field.
            elif cur_field == 'af ':
                if tmp == cur_field:
                    author_dict.clear()
                    author_order = 1
                full_name = line[3:]
                try:
                    pos = full_name.index(',')
                except ValueError:
                    try:
                        pos = full_name.index(' ')
                    except ValueError:
                        pos = len(full_name)
                author = WosAuthor(
                    full_name[pos + 1:], full_name[:pos].strip(),
                    initials_list[author_order - 1].replace(',', ''),
                    author_order, 0)
                author.document = wos_document
                author_dict[full_name] = author

                if author_order == 1:
                    wos_document.first_author = \
                        initials_list[author_order - 1].replace(',', '')

                author_order += 1
            elif cur_field == 'ca ':
                group_author = line[3:]
                initials_list.append(group_author)
                author = WosAuthor(
                    group_author, None,
                    initials_list[author_order - 1].replace(',', ''),
                    author_order, 0)
                author_dict[group_author] = author

                if author_order == 1:
                    wos_document.first_author = \
                        initials_list[author_order - 1].replace(',', '')

                author_order += 1
            elif cur_field == 'c1 ':
                # Bind affiliation addresses to the authors extracted above
                author_affiliation = line[3:]
                try:
                    pos = author_affiliation.index(']')
                except ValueError:
                    # print('Affiliation {} has no authors; discarded'.format(author_affiliation))
                    continue
                authors = author_affiliation[1:pos].split('; ')
                for author in authors:
                    affiliation = WosAffiliation(author_affiliation[pos + 2:-1])
                    affiliation.author = author_dict[author]
            elif cur_field == 'rp ':
                # Determine the reprint (corresponding) author
                rp_author_affiliations = line[3:].split('; ')
                for rp_author_affiliation in rp_author_affiliations:
                    try:
                        pos = rp_author_affiliation.index(' (')
                        rp_author = rp_author_affiliation[:pos]
                    except ValueError:
                        rp_author = rp_author_affiliation
                    try:
                        rp_index = initials_list.index(rp_author) + 1
                    except ValueError:
                        rp_index = 1
                    for author in author_dict.keys():
                        if author_dict[author].author_order == rp_index:
                            author_dict[author].is_reprint_author = 1
            elif cur_field == 'ti ':
                title = line[3:]
                if wos_document.title is not None:
                    wos_document.title += ' ' + title
                else:
                    wos_document.title = title
            elif cur_field == 'so ':
                if journal_line is not None:
                    journal_line += ' ' + line[3:]
                else:
                    journal_line = line[3:]
            elif cur_field == 'la ':
                wos_document.language = line[3:]
            elif cur_field == 'dt ':
                wos_document.document_type = line[3:]
            elif cur_field == 'de ':
                if keyword_line is not None:
                    keyword_line += ' ' + line[3:]
                else:
                    keyword_line = line[3:]
            elif cur_field == 'id ':
                if keyword_plus_line is not None:
                    keyword_plus_line += ' ' + line[3:]
                else:
                    keyword_plus_line = line[3:]
            elif cur_field == 'ab ':
                if wos_document.abs is not None:
                    wos_document.abs += ' ' + line[3:]
                else:
                    wos_document.abs = line[3:]
            elif cur_field == 'em ':
                wos_document.author_email = line[3:].replace(' ', '')
            elif cur_field == 'fu ':
                if funding_line is not None:
                    funding_line += ' ' + line[3:]
                else:
                    funding_line = line[3:]
            elif cur_field == 'fx ':
                if wos_document.funding_text is not None:
                    wos_document.funding_text += ' ' + line[3:]
                else:
                    wos_document.funding_text = line[3:]
            elif cur_field == 'cr ':
                # Parse a cited reference

                reference = line[3:]
                ref_split = reference.split(', ')
                first_author = None
                pub_year = None
                journal = None
                volume = None
                start_page = None
                doi = None

                year_flag = False

                if len(ref_split) < 2:
                    journal = ref_split[0]
                else:
                    i_list = []
                    for i_part in range(len(ref_split)):
                        volume_match = volume_pattern.match(ref_split[i_part])
                        page_match = page_pattern.match(ref_split[i_part])
                        doi_match = doi_pattern.match(ref_split[i_part])
                        if not year_flag:
                            year_match = year_pattern.match(ref_split[i_part])
                        else:
                            year_match = None

                        if year_match:
                            pub_year = ref_split[i_part]
                            i_list.append(i_part)
                            year_flag = True
                        elif volume_match:
                            volume = ref_split[i_part][1:]
                            i_list.append(i_part)
                        elif page_match:
                            start_page = ref_split[i_part][1:]
                            i_list.append(i_part)
                        elif doi_match:
                            doi = ref_split[i_part].replace(
                                'doi ', '').replace('[', '').replace(']', '')
                            i_list.append(i_part)

                    i_list.sort()
                    if len(i_list) > 0:
                        if min(i_list) > 0:
                            first_author = ref_split[0]
                        start_pos = None
                        end_pos = None
                        pos = 0
                        while pos < len(i_list) - 1:
                            if i_list[pos + 1] - i_list[pos] > 1:
                                start_pos = i_list[pos] + 1
                            if start_pos is not None and i_list[
                                    pos + 1] - i_list[pos] == 1:
                                end_pos = i_list[pos]
                                break
                            pos += 1
                        if start_pos is not None or end_pos is not None:
                            if start_pos == end_pos:
                                journal = ref_split[start_pos]
                            elif end_pos is None:
                                journal = ', '.join(
                                    ref_split[start_pos:i_list[-1]])
                            else:
                                journal = ', '.join(
                                    ref_split[start_pos:end_pos])

                        else:
                            if year_flag:
                                try:
                                    journal = ref_split[i_list[-1] + 1]
                                except IndexError:
                                    journal = None
                            else:
                                journal = ref_split[i_list[0] - 1]
                    else:
                        first_author = ref_split[0]
                        journal = ref_split[1]

                # Reference fields are very irregular and often overlong, so truncate them
                if first_author is not None and len(first_author) > 254:
                    first_author = first_author[:254]
                if journal is not None and len(journal) > 254:
                    journal = journal[:254]

                ref = WosReference(
                    first_author.replace('.', '').replace('. ', '').replace(
                        ',', '') if first_author else first_author, pub_year,
                    journal, volume, start_page, doi)
                ref.document = wos_document

            elif cur_field == 'nr ':
                wos_document.reference_num = int(line[3:])
            elif cur_field == 'tc ':
                wos_document.cited_times = int(line[3:])
            elif cur_field == 'u1 ':
                wos_document.usage_180 = int(line[3:])
            elif cur_field == 'u2 ':
                wos_document.usage_since_2013 = int(line[3:])
            elif cur_field == 'pu ':
                wos_document.publisher = line[3:]
            elif cur_field == 'ji ':
                wos_document.journal_iso = line[3:]
            elif cur_field == 'j9 ':
                wos_document.journal_29 = line[3:]
            elif cur_field == 'pd ':
                wos_document.pub_month_day = line[3:]
            elif cur_field == 'py ':
                wos_document.pub_year = line[3:]
            elif cur_field == 'vl ':
                wos_document.volume = line[3:]
            elif cur_field == 'is ':
                wos_document.issue = line[3:]
            elif cur_field == 'bp ':
                wos_document.start_page = line[3:]
            elif cur_field == 'ep ':
                wos_document.end_page = line[3:]
            elif cur_field == 'di ':
                wos_document.doi = line[3:]
            elif cur_field == 'wc ':
                if wos_category_line is not None:
                    wos_category_line += ' ' + line[3:]
                else:
                    wos_category_line = line[3:]
            elif cur_field == 'sc ':
                if research_area_line is not None:
                    research_area_line += ' ' + line[3:]
                else:
                    research_area_line = line[3:]
            elif cur_field == 'ut ':
                wos_document.unique_id = line[7:]
            elif cur_field == 'er':
                # Handle the buffered multi-line fields at the end-of-record line
                if journal_line is not None:
                    wos_document.journal = journal_line
                    journal_line = None

                if keyword_line is not None:
                    keywords = keyword_line.split('; ')
                    for keyword in keywords:
                        if len(keyword) > 254:
                            keyword = keyword[:254]
                        key = WosKeyword(keyword)
                        key.document = wos_document
                    keyword_line = None

                if keyword_plus_line is not None:
                    keyword_plus = keyword_plus_line.split('; ')
                    for kp in keyword_plus:
                        if len(kp) > 254:
                            kp = kp[:254]
                        keyp = WosKeywordPlus(kp)
                        keyp.document = wos_document
                    keyword_plus_line = None

                if wos_category_line is not None:
                    categories = wos_category_line.split('; ')
                    for category in categories:
                        if len(category) > 254:
                            category = category[:254]
                        cat = WosCategory(category)
                        cat.document = wos_document
                    wos_category_line = None

                if research_area_line is not None:
                    areas = research_area_line.split('; ')
                    for area in areas:
                        if len(area) > 254:
                            area = area[:254]
                        a = WosResearchArea(area)
                        a.document = wos_document
                    research_area_line = None

                if funding_line is not None:
                    fundings = funding_line.split('; ')
                    for fund in fundings:
                        pos = find_nth(fund, '[', -1)
                        if pos != -1:
                            funding = [fund[:pos], fund[pos:]]
                            agent = funding[0]
                            numbers = funding[1].replace('[', '').replace(
                                ']', '').split(', ')
                            for number in numbers:
                                f = WosFunding(agent, number)
                                f.document = wos_document
                        else:
                            agent = fund
                            f = WosFunding(agent, None)
                            f.document = wos_document
                    funding_line = None
                wos_document.document_md5 = document_hash(wos_document)

                # TODO: exclude non-article/review documents; remember to remove this after use
                # if (not 'article' in wos_document.document_type and not 'review' in wos_document.document_type) \
                #         or 'early access' in wos_document.document_type or 'retracted' in wos_document.document_type\
                #         or 'software' in wos_document.document_type or 'hardware' in wos_document.document_type\
                #         or 'exhibit' in wos_document.document_type or 'database' in wos_document.document_type\
                #         or 'book' in wos_document.document_type:
                #     continue

                # Uniformly truncate overlong fields
                if len(wos_document.title) > 499:
                    wos_document.title = wos_document.title[:499]
                if wos_document.unique_id in exist_set:
                    continue
                else:
                    exist_set.add(wos_document.unique_id)
                    wos_document_list.append(wos_document)

    print('Finished parsing {}, writing to database...'.format(input_file))
    session.add_all(wos_document_list)
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
    return exist_set
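find_nth, used above when splitting funding strings, is defined elsewhere in the project; one plausible implementation, assuming n=-1 means the last occurrence:

def find_nth(haystack, needle, n):
    # Hypothetical helper: return the index of the n-th occurrence of
    # needle, or -1 if absent; n == -1 searches from the right.
    if n == -1:
        return haystack.rfind(needle)
    pos = -1
    for _ in range(n):
        pos = haystack.find(needle, pos + 1)
        if pos == -1:
            return -1
    return pos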
Example 12
import traceback

import keys

import time

import app
from model_service import sync_spot_prices_calc, spot_symbols_with_futures

import model

import config

cache = dict()

engine = model.get_engine()


def price_risk_safe(symbol, book, risk=config.RISK, safe=config.SAFE):
    sum_total_risk, sum_total_safe, sum_size_risk, sum_size_safe = 0, 0, 0, 0
    sum_provisorio_total, sum_provisorio_size = 0, 0
    price_risk, price_safe = 0, 0

    # print(book)

    for i in book:

        size = float(i[1])
        price = float(i[0])

        sum_provisorio_total += price * size
Example 13
def resetFutureBalance(future_symbol):
    with Session(model.get_engine()) as session, session.begin():
        future_object = session.query(model.Future).get(future_symbol)
        future_object.balance.outdated = True
Example 14
            'R': False,
            'wt': 'CONTRACT_PRICE',
            'ot': 'MARKET',
            'ps': 'BOTH',
            'cp': False,
            'ma': 'BTC',
            'rp': '0.00038223',
            'pP': False,
            'si': 0,
            'ss': 0
        }
    }

    future_order_dict = future_order_dict['o']

    with Session(model.get_engine()) as session:
        with session.begin():

            order_id = future_order_dict['i']

            future_order = session.query(
                model.FutureOrder).filter_by(order_id=order_id).first()

            if future_order:
                print(
                    f"Encontre future order = {future_order.id}, {future_order.status}"
                )

            future_buy = model_helper.sync_future_order(
                future_order_dict, future_order)
Example 15
def parse_single(input_file=None, db_path=None, db_url=None):
    assert input_file is not None and (db_path is not None
                                       or db_url is not None)

    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    print('Parsing {}...'.format(input_file))

    # TODO: real-time metrics such as times cited and usage counts are absent from the official export format; author emails are too detailed and are not parsed for now

    # Read the XML file with ElementTree; the root tag is records.
    # WoS XML files carry a namespace, which must be included when
    # searching for nodes.
    tree = ET.parse(input_file)
    records = tree.getroot()
    name_space = records.tag[:records.tag.index('}') + 1]
    wos_document_list = []

    for record in records:
        wos_document = WosDocument()

        wos_document.unique_id = get_unique_id(name_space, record)
        wos_document.title = get_title(name_space, record)
        wos_document.abs = get_abs(name_space, record)
        wos_document.journal = get_journal(name_space, record)
        wos_document.journal_iso = get_journal_iso(name_space, record)
        wos_document.journal_29 = get_journal_29(name_space, record)
        wos_document.publisher = get_publisher(name_space, record)
        wos_document.volume = get_volume(name_space, record)
        wos_document.issue = get_issue(name_space, record)
        wos_document.start_page = get_start_page(name_space, record)
        wos_document.end_page = get_end_page(name_space, record)
        wos_document.pub_year = get_pub_year(name_space, record)
        wos_document.pub_month_day = get_pub_month_day(name_space, record)
        wos_document.document_type = get_document_type(name_space, record)
        wos_document.doi = get_doi(name_space, record)
        wos_document.reference_num = get_reference_num(name_space, record)
        wos_document.funding_text = get_funding_text(name_space, record)
        wos_document.language = get_language(name_space, record)

        # print(wos_document)

        authors = record.find(
            './{0}static_data/{0}summary/{0}names'.format(name_space))
        wos_document.authors = get_authors(name_space, authors, record)

        # print(wos_document.authors)

        references = record.find(
            './{0}static_data/{0}fullrecord_metadata/{0}references'.format(
                name_space))
        wos_document.references = get_references(name_space, references)
        # print(wos_document.references)

        categories = record.findall(
            './{0}static_data/{0}fullrecord_metadata/{0}category_info/{0}subjects/{0}subject[@ascatype="traditional"]'
            .format(name_space))
        wos_document.categories = get_categories(name_space, categories)
        # print(wos_document.categories)

        areas = record.findall(
            './{0}static_data/{0}fullrecord_metadata/{0}category_info/{0}subjects/{0}subject[@ascatype="extended"]'
            .format(name_space))
        wos_document.research_areas = get_research_areas(name_space, areas)
        # print(wos_document.research_areas)

        keywords = record.find(
            './{0}static_data/{0}fullrecord_metadata/{0}keywords'.format(
                name_space))
        wos_document.keywords = get_keywords(name_space, keywords)
        # print(wos_document.keywords)

        keyword_plus = record.find(
            './{0}static_data/{0}item/{0}keywords_plus'.format(name_space))
        wos_document.keyword_plus = get_keyword_plus(name_space, keyword_plus)
        # print(wos_document.keyword_plus)

        fundings = record.find(
            './{0}static_data/{0}fullrecord_metadata/{0}fund_ack/{0}grants'.
            format(name_space))
        wos_document.fundings = get_fundings(name_space, fundings)
        # print(wos_document.fundings)

        wos_document_list.append(wos_document)

        # Flush to the database promptly and clear the buffer
        if len(wos_document_list) > 499:
            print('Buffer reached threshold, writing to database...')
            session.add_all(wos_document_list)
            session.commit()
            wos_document_list.clear()
        # print()

    print('Finished parsing {}, writing to database...'.format(input_file))
    session.add_all(wos_document_list)
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
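The namespace handling above relies on ElementTree keeping the namespace URI inside each tag; a small self-contained illustration (the URI is made up) of what name_space holds and how it is spliced into the XPath strings:

import xml.etree.ElementTree as ET

# Toy records document with a default namespace, mirroring the WoS layout
records = ET.fromstring(
    '<records xmlns="http://example.org/wok"><REC/></records>')

# records.tag == '{http://example.org/wok}records', so slicing up to and
# including '}' yields the '{uri}' prefix used by every find() call above.
name_space = records.tag[:records.tag.index('}') + 1]
print(name_space)                                    # {http://example.org/wok}
print(records.find('./{0}REC'.format(name_space)))   # matches the REC child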
Example 16
def parse_single(input_file=None, db_path=None, db_url=None):
    assert input_file is not None and (db_path is not None
                                       or db_url is not None)

    print('Parsing {}...'.format(input_file))

    bibtex_filename = input_file

    with open(bibtex_filename, 'r', encoding='utf-8') as file:
        parser = BibTexParser()
        parser.customization = customizations
        bib_db = bibtexparser.load(file, parser=parser)

    # print(len(bib_db.entries))
    # exit(-1)

    # for k,v in bib_db.entries[0].items():
    #     print(k,v)
    #     print('======\n')
    # exit(0)

    # if len(bib_db.entries) != 500:
    #     exit(-1)

    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    for i in range(len(bib_db.entries)):
        author_list = []
        category_list = []
        area_list = []
        keyword_list = []
        keyword_plus_list = []
        reference_list = []
        funding_list = []

        # Parse basic document info for the wos_document table
        try:
            wos_document = WosDocument(
                bib_db.entries[i]['unique-id'][5:-1].lower()
                if 'unique-id' in bib_db.entries[i] else None,
                bib_db.entries[i]['title'][1:-1].lower().replace(
                    '\n', ' ').replace('\\', '')
                if 'title' in bib_db.entries[i] else None,
                bib_db.entries[i]['abstract'][1:-1].lower().replace(
                    '\n', ' ').replace('\\', '')
                if 'abstract' in bib_db.entries[i] else None,
                bib_db.entries[i]['journal'][1:-1].lower().replace('\\', '')
                if 'journal' in bib_db.entries[i] else bib_db.entries[i]
                ['booktitle'][1:-1].lower().replace('\n', ' ').replace(
                    '\\', '') if 'booktitle' in bib_db.entries[i] else None,
                bib_db.entries[i]['journal-iso'][1:-1].lower().replace(
                    '\\', '') if 'journal-iso' in bib_db.entries[i] else None,
                # bibtex has no 29-character journal abbreviation
                None,
                bib_db.entries[i]['publisher'][1:-1].lower().replace('\\', '')
                if 'publisher' in bib_db.entries[i] else None,
                bib_db.entries[i]['volume'][1:-1].lower()
                if 'volume' in bib_db.entries[i] else None,
                bib_db.entries[i]['number'][1:-1].lower()
                if 'number' in bib_db.entries[i] else None,
                bib_db.entries[i]['pages'][1:-1].lower().split('-')[0]
                if 'pages' in bib_db.entries[i] and
                len(bib_db.entries[i]['pages'][1:-1].lower().split('-')) > 1
                else bib_db.entries[i]['pages'][1:-1].lower().split('+')[0]
                if 'pages' in bib_db.entries[i] else None,
                bib_db.entries[i]['pages'][1:-1].lower().split('-')[1]
                if 'pages' in bib_db.entries[i] and
                len(bib_db.entries[i]['pages'][1:-1].lower().split('-')) > 1
                else '+' if 'pages' in bib_db.entries[i] else None,
                bib_db.entries[i]['year'][1:-1].lower()
                if 'year' in bib_db.entries[i] else None,
                bib_db.entries[i]['month'][1:-1].lower()
                if 'month' in bib_db.entries[i] else None,
                bib_db.entries[i]['type'][1:-1].lower()
                if 'type' in bib_db.entries[i] else None,
                bib_db.entries[i]['doi'][1:-1].lower()
                if 'doi' in bib_db.entries[i] else None,
                bib_db.entries[i]['times-cited'][1:-1].lower()
                if 'times-cited' in bib_db.entries[i] else None,
                bib_db.entries[i]['number-of-cited-references'][1:-1].lower()
                if 'number-of-cited-references' in bib_db.entries[i] else None,
                bib_db.entries[i]['usage-count-last-180-days'][1:-1].lower()
                if 'usage-count-last-180-days' in bib_db.entries[i] else None,
                bib_db.entries[i]['usage-count-since-2013'][1:-1].lower()
                if 'usage-count-since-2013' in bib_db.entries[i] else None,
                bib_db.entries[i]['funding-text'][1:-1].lower().replace(
                    '\n', ' ').replace('\\', '')
                if 'funding-text' in bib_db.entries[i] else None,
                bib_db.entries[i]['language'][1:-1].lower()
                if 'language' in bib_db.entries[i] else None,
                bib_db.entries[i]['author-email'][1:-1].lower().replace(
                    '\n', ';').replace('\\', '')
                if 'author-email' in bib_db.entries[i] else None)
        except Exception as e:
            print(bib_db.entries[i])
            print('This entry failed:', e)
            exit(-1)
        # TODO: skip malformed records for now
        if wos_document.unique_id is None:
            print('File {} contains a malformed record; skipped it'.format(input_file))
            continue

        # Parse author and affiliation info
        if bib_db.entries[i]['affiliation'] is not None:
            for author_info, addresses in bib_db.entries[i][
                    'affiliation'].items():

                affiliation_list = []
                # The bibtex format provides no standardized abbreviation
                author = WosAuthor(author_info[0], author_info[1], None,
                                   author_info[2], author_info[3])
                session.add(author)
                session.flush()

                if addresses is None:
                    no_affiliation = WosAffiliation(None)
                    affiliation_list.append(no_affiliation)
                else:
                    for address in addresses:
                        affiliation = WosAffiliation(address)
                        affiliation_list.append(affiliation)

                author.affiliations = affiliation_list

                author_list.append(author)
            wos_document.authors = author_list

        # Parse WoS category info
        if bib_db.entries[i]['web-of-science-categories'] is not None:
            for category in bib_db.entries[i]['web-of-science-categories']:
                cat = WosCategory(category)
                category_list.append(cat)
            wos_document.categories = category_list

        # Parse research area info
        if bib_db.entries[i]['research-areas'] is not None:
            for area in bib_db.entries[i]['research-areas']:
                a = WosResearchArea(area)
                area_list.append(a)
            wos_document.research_areas = area_list

        # Parse author keywords
        if bib_db.entries[i]['keywords'] is not None:
            for keyword in bib_db.entries[i]['keywords']:
                key = WosKeyword(keyword)
                keyword_list.append(key)
            wos_document.keywords = keyword_list

        # Parse WoS KeywordPlus
        if bib_db.entries[i]['keywords-plus'] is not None:
            for keyword_plus in bib_db.entries[i]['keywords-plus']:
                kp = WosKeywordPlus(keyword_plus)
                keyword_plus_list.append(kp)
            wos_document.keyword_plus = keyword_plus_list

        # Parse funding info
        if bib_db.entries[i]['funding-acknowledgement'] is not None:
            for agent, numbers in bib_db.entries[i][
                    'funding-acknowledgement'].items():
                for number in numbers:
                    fund = WosFunding(agent, number)
                    funding_list.append(fund)
            wos_document.fundings = funding_list

        # Parse reference info
        if bib_db.entries[i]['cited-references'] is not None:
            for reference in bib_db.entries[i]['cited-references']:
                ref = WosReference(reference[0], reference[1], reference[2],
                                   reference[3], reference[4], reference[5])
                reference_list.append(ref)
            wos_document.references = reference_list

        session.add(wos_document)

    print('Finished parsing {}, inserting...'.format(input_file))

    session.commit()
    session.close()

    print('Finished inserting {}\n'.format(input_file))
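The customizations callback handed to BibTexParser is not shown on this page; a minimal sketch of its shape (bibtexparser calls it once per entry and expects the entry back; the real one evidently also splits fields such as affiliation, keywords, and cited-references into structured values):

from bibtexparser.customization import convert_to_unicode

def customizations(record):
    # Hypothetical minimal version; the project's actual callback must do
    # far more (split authors, affiliations, keywords, references, ...).
    return convert_to_unicode(record)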
Example 17
def term_feature_extraction(db_path=None, db_url=None, output_path=''):
    assert (db_path is None and db_url is not None) or (db_path is not None and db_url is None)
    assert output_path is not None

    if db_path:
        engine = get_engine(db_path=db_path)
    else:
        engine = get_engine(db_url=db_url)

    meta = MetaData(engine)
    con = engine.connect()
    session = get_session(engine)
    term_table = Table('terms', meta, autoload=True)

    dataframe = []
    abbr = {}

    query_terms = select([term_table.c.tid, term_table.c.term, term_table.c.term2]).where(
        term_table.c.tid > 9281).order_by(asc(term_table.c.tid))
    result = con.execute(query_terms)
    terms = list(result)
    result.close()

    # with open('terms.csv', mode='w', encoding='utf-8') as _:
    #     _.write('"' + '","'.join(
    #         ['term', 'year', 'document_count', 'document_increment_ratio', 'author_count', 'citation_count',
    #          'funding_count',
    #          'reference_count', 'acc_document_count', 'acc_author_count', 'acc_citation_count',
    #          'acc_funding_count', 'acc_reference_count']) + '"\n')

    with open(r'abbreviations.txt', mode='r', encoding='utf-8') as abbr_file:
        for line in abbr_file:
            line_split = line.strip().split('\t')
            abbr[line_split[2].lower()] = line_split[1].lower()

    for term in terms:
        if len(term[1]) < 6:
            new_term = abbr.get(term[1], term[1])
        else:
            new_term = term[1]
        print(term[0], ':', new_term)

        acc_document_count = 0
        acc_author_count = 0
        acc_reference_count = 0
        acc_funding_count = 0
        acc_citation_count = 0

        last_document_count = 0

        for year in range(2003, 2013):
            document_count = 0
            author_count = 0
            reference_count = 0
            funding_count = 0
            citation_count = 0

            documents = session.query(WosDocument).filter(WosDocument.pub_year == year).all()
            for document in documents:
                keyword_plus = ', '.join(
                    [kp.keyword_plus for kp in document.keyword_plus]) if document.keyword_plus else ''
                keyword = ', '.join([kw.keyword for kw in document.keywords]) if document.keywords else ''
                title = document.title if document.title else ''
                abstract = document.abs if document.abs else ''
                string = ' '.join([keyword_plus, keyword, title, abstract])

                # Allow an optional plural 's' at the end of the term
                term_pattern = re.compile(r'\b{}s?\b'.format(re.escape(new_term)))
                if term[2]:
                    term2_pattern = re.compile(r'\b{}s?\b'.format(re.escape(term[2])))
                else:
                    term2_pattern = re.compile(r'(?!)')  # never matches
                if term_pattern.search(string) or term2_pattern.search(string):
                    # if new_term in keyword_plus or new_term in keyword or new_term in title or new_term in abstract:
                    document_count += 1
                    author_count += len(document.authors)
                    reference_count += len(document.references)
                    funding_count += len(document.fundings)
                    citation_count += document.cited_times

            acc_document_count += document_count
            acc_author_count += author_count
            acc_reference_count += reference_count
            acc_funding_count += funding_count
            acc_citation_count += citation_count
            document_increment_ratio = (document_count / last_document_count) if last_document_count != 0 else 0
            last_document_count = document_count

            # dataframe.append((new_term, year, document_count,document_increment_ratio, author_count, citation_count, funding_count,
            #                   reference_count, acc_document_count, acc_author_count, acc_citation_count, acc_funding_count, acc_reference_count))

            with open('terms.csv', mode='a', encoding='utf-8') as file:
                file.write('"' + '","'.join(map(str,
                                                [new_term, year, document_count, document_increment_ratio, author_count,
                                                 citation_count, funding_count,
                                                 reference_count, acc_document_count, acc_author_count,
                                                 acc_citation_count, acc_funding_count, acc_reference_count])) + '"\n')

    # df = pd.DataFrame(dataframe, columns=['term','year','document_count','document_increment_ratio','author_count', 'citation_count','funding_count',
    #                                     'reference_count','acc_document_count','acc_author_count','acc_citation_count',
    #                                     'acc_funding_count','acc_reference_count'])
    # df.to_csv('terms.csv', index=None)

    session.close()
    con.close()
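A quick check of the term pattern's behavior (the sample strings are made up):

import re

term_pattern = re.compile(r'\b{}s?\b'.format(re.escape('nanotube')))
print(bool(term_pattern.search('carbon nanotubes in water')))  # True: plural
print(bool(term_pattern.search('the nanotube-based sensor')))  # True: '-' is a word boundary
print(bool(term_pattern.search('mononanotube')))               # False: no left boundary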
Example 18
def resetSpotBalance(asset):
    with Session(model.get_engine()) as session, session.begin():
        spot_balance = session.query(model.SpotBalance).get(asset)
        spot_balance.outdated = True
Example 19
def parse_single(input_file=None, db_path=None, db_url=None):
    assert input_file is not None and (db_path is not None
                                       or db_url is not None)

    engine = get_engine(db_path, db_url)
    Base.metadata.create_all(engine)
    session = get_session(engine)

    print('Parsing {}...'.format(input_file))

    # TODO: real-time metrics such as times cited and usage counts are absent from the official export format; author emails are too detailed and are not parsed for now

    # Read the XML file with ElementTree; the root tag is records.
    # WoS XML files carry a namespace that must be included when
    # searching for nodes.
    # tree = ET.parse(input_file)
    # records = tree.getroot()
    # name_space = records.tag[:records.tag.index('}')+1]
    wos_document_list = []
    # ns_pattern = re.compile(r'<records xmlns="(.+?)"')
    # name_space = None

    with open(input_file, mode='r', encoding='utf-8') as file:
        single_record = ''
        for line in file:
            if '<REC' in line:
                single_record = line
            elif '</REC>' in line:
                single_record += line

                start = time.time()

                record = ET.fromstring(single_record)

                wos_document = WosDocument()

                wos_document.unique_id = get_unique_id(record)
                wos_document.title = get_title(record)
                wos_document.abs = get_abs(record)
                wos_document.journal = get_journal(record)
                wos_document.journal_iso = get_journal_iso(record)
                wos_document.journal_29 = get_journal_29(record)
                wos_document.publisher = get_publisher(record)
                wos_document.volume = get_volume(record)
                wos_document.issue = get_issue(record)
                wos_document.start_page = get_start_page(record)
                wos_document.end_page = get_end_page(record)
                wos_document.pub_year = get_pub_year(record)
                wos_document.pub_month_day = get_pub_month_day(record)
                wos_document.document_type = get_document_type(record)
                wos_document.doi = get_doi(record)
                wos_document.reference_num = get_reference_num(record)
                wos_document.funding_text = get_funding_text(record)
                wos_document.language = get_language(record)

                # print(wos_document)

                authors = record.find('./static_data/summary/names')
                wos_document.authors = get_authors(authors, record)

                # print(wos_document.authors)

                references = record.find(
                    './static_data/fullrecord_metadata/references')
                wos_document.references = get_references(references)

                # print(wos_document.references)

                categories = record.findall(
                    './static_data/fullrecord_metadata/category_info/subjects/subject[@ascatype="traditional"]'
                )
                wos_document.categories = get_categories(categories)
                # print(wos_document.categories)

                areas = record.findall(
                    './static_data/fullrecord_metadata/category_info/subjects/subject[@ascatype="extended"]'
                )
                wos_document.research_areas = get_research_areas(areas)
                # print(wos_document.research_areas)

                keywords = record.find(
                    './static_data/fullrecord_metadata/keywords')
                wos_document.keywords = get_keywords(keywords)
                # print(wos_document.keywords)

                keyword_plus = record.find('./static_data/item/keywords_plus')
                wos_document.keyword_plus = get_keyword_plus(keyword_plus)
                # print(wos_document.keyword_plus)

                fundings = record.find(
                    './static_data/fullrecord_metadata/fund_ack/grants')
                wos_document.fundings = get_fundings(fundings)
                # print(wos_document.fundings)

                wos_document_list.append(wos_document)

                # Flush to the database promptly and clear the buffer
                if len(wos_document_list) > 499:
                    print('Buffer reached threshold, writing to database...', end='')
                    session.add_all(wos_document_list)
                    session.commit()
                    print(' done - {} s'.format(time.time() - start))
                    wos_document_list.clear()

                # print()
            elif line == '\n':
                pass
            else:
                single_record += line

    print('Finished parsing {}, writing to database...'.format(input_file))
    session.add_all(wos_document_list)
    session.commit()
    session.close()
    print('Finished inserting {}\n'.format(input_file))
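The manual '<REC' / '</REC>' splitting above keeps memory flat on huge exports; xml.etree.ElementTree.iterparse gives the same streaming behavior and is shown here as an alternative sketch (tag names assume the WoS schema used above):

import xml.etree.ElementTree as ET

def iter_records(input_file):
    # Streaming alternative to the manual line splitting: yield each fully
    # parsed REC element, then clear it so memory stays bounded.
    for event, elem in ET.iterparse(input_file, events=('end',)):
        if elem.tag.endswith('REC'):
            yield elem
            elem.clear()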